diff --git a/kernel/mips/KERNEL.P5600 b/kernel/mips/KERNEL.P5600 index 802f0e0e5..5d8bcb9ec 100644 --- a/kernel/mips/KERNEL.P5600 +++ b/kernel/mips/KERNEL.P5600 @@ -80,11 +80,6 @@ DGEMVTKERNEL = ../mips/gemv_t.c CGEMVTKERNEL = ../mips/zgemv_t.c ZGEMVTKERNEL = ../mips/zgemv_t.c -STRMMKERNEL = ../generic/trmmkernel_2x2.c -DTRMMKERNEL = ../generic/trmmkernel_2x2.c -CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c -ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c - SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c SGEMMOTCOPY = ../mips/sgemm_tcopy_8_msa.c @@ -101,15 +96,19 @@ DGEMMITCOPYOBJ = dgemm_itcopy.o DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o -CGEMMKERNEL = ../generic/zgemmkernel_2x2.c -CGEMMONCOPY = ../generic/zgemm_ncopy_2.c -CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMKERNEL = ../mips/cgemm_kernel_8x4_msa.c +CGEMMINCOPY = ../mips/cgemm_ncopy_8_msa.c +CGEMMITCOPY = ../mips/cgemm_tcopy_8_msa.c +CGEMMONCOPY = ../mips/cgemm_ncopy_4_msa.c +CGEMMOTCOPY = ../mips/cgemm_tcopy_4_msa.c +CGEMMINCOPYOBJ = cgemm_incopy.o +CGEMMITCOPYOBJ = cgemm_itcopy.o CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o -ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c -ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMKERNEL = ../mips/zgemm_kernel_4x4_msa.c +ZGEMMONCOPY = ../mips/zgemm_ncopy_4_msa.c +ZGEMMOTCOPY = ../mips/zgemm_tcopy_4_msa.c ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o diff --git a/kernel/mips/cgemm_kernel_8x4_msa.c b/kernel/mips/cgemm_kernel_8x4_msa.c new file mode 100644 index 000000000..cd1fa45b3 --- /dev/null +++ b/kernel/mips/cgemm_kernel_8x4_msa.c @@ -0,0 +1,2154 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +#define CGEMM_KERNEL_8X4_MSA(OP0, OP1, OP2, OP3, OP4) \ +{ \ + LD_SP4_INC(pa0, 4, src_a0, src_a1, src_a2, src_a3); \ + LD_SP2_INC(pb0, 4, src_b0, src_b1); \ + \ + PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ + PCKEVOD_W2_SP(src_a3, src_a2, src_a1r, src_a1i); \ + \ + /* 0th col */ \ + SPLATI_W2_SP(src_b0, 0, src_br, src_bi); \ + res0_r OP0## = src_a0r * src_br; \ + res0_r OP1## = src_a0i * src_bi; \ + res0_i OP2## = (OP4 src_a0r) * src_bi; \ + res0_i OP3## = src_a0i * src_br; \ + \ + res1_r OP0## = src_a1r * src_br; \ + res1_r OP1## = src_a1i * src_bi; \ + res1_i OP2## = (OP4 src_a1r) * src_bi; \ + res1_i OP3## = src_a1i * src_br; \ + \ + /* 1st col */ \ + SPLATI_W2_SP(src_b0, 2, src_br, src_bi); \ + res2_r OP0## = src_a0r * src_br; \ + res2_r OP1## = src_a0i * src_bi; \ + res2_i OP2## = (OP4 src_a0r) * src_bi; \ + res2_i OP3## = src_a0i * src_br; \ + \ + res3_r OP0## = src_a1r * src_br; \ + res3_r OP1## = src_a1i * src_bi; \ + res3_i OP2## = (OP4 src_a1r) * src_bi; \ + res3_i OP3## = src_a1i * src_br; \ + \ + /* 2nd col */ \ + SPLATI_W2_SP(src_b1, 0, src_br, src_bi); \ + res4_r OP0## = src_a0r * src_br; \ + res4_r OP1## = src_a0i * src_bi; \ + res4_i OP2## = (OP4 src_a0r) * src_bi; \ + res4_i OP3## = src_a0i * src_br; \ + \ + res5_r OP0## = src_a1r * src_br; \ + res5_r OP1## = src_a1i * src_bi; \ + res5_i OP2## = (OP4 src_a1r) * src_bi; \ + res5_i OP3## = src_a1i * src_br; \ + \ + /* 3rd col */ \ + SPLATI_W2_SP(src_b1, 2, src_br, src_bi); \ + res6_r OP0## = src_a0r * src_br; \ + res6_r OP1## = src_a0i * src_bi; \ + res6_i OP2## = (OP4 src_a0r) * src_bi; \ + res6_i OP3## = src_a0i * src_br; \ + \ + res7_r OP0## = src_a1r * src_br; \ + res7_r OP1## = src_a1i * src_bi; \ + res7_i OP2## = (OP4 src_a1r) * src_bi; \ + res7_i OP3## = src_a1i * src_br; \ +} + +#define CGEMM_KERNEL_8X2_MSA(OP0, OP1, OP2, OP3, OP4) \ +{ \ + 
LD_SP4_INC(pa0, 4, src_a0, src_a1, src_a2, src_a3); \ + src_b0 = LD_SP(pb0); \ + \ + PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ + PCKEVOD_W2_SP(src_a3, src_a2, src_a1r, src_a1i); \ + \ + /* 0th col */ \ + SPLATI_W2_SP(src_b0, 0, src_br, src_bi); \ + res0_r OP0## = src_a0r * src_br; \ + res0_r OP1## = src_a0i * src_bi; \ + res0_i OP2## = (OP4 src_a0r) * src_bi; \ + res0_i OP3## = src_a0i * src_br; \ + \ + res1_r OP0## = src_a1r * src_br; \ + res1_r OP1## = src_a1i * src_bi; \ + res1_i OP2## = (OP4 src_a1r) * src_bi; \ + res1_i OP3## = src_a1i * src_br; \ + \ + /* 1st col */ \ + SPLATI_W2_SP(src_b0, 2, src_br, src_bi); \ + res2_r OP0## = src_a0r * src_br; \ + res2_r OP1## = src_a0i * src_bi; \ + res2_i OP2## = (OP4 src_a0r) * src_bi; \ + res2_i OP3## = src_a0i * src_br; \ + \ + res3_r OP0## = src_a1r * src_br; \ + res3_r OP1## = src_a1i * src_bi; \ + res3_i OP2## = (OP4 src_a1r) * src_bi; \ + res3_i OP3## = src_a1i * src_br; \ +} + +#define CGEMM_KERNEL_8X1_MSA(OP0, OP1, OP2, OP3, OP4) \ +{ \ + LD_SP4_INC(pa0, 4, src_a0, src_a1, src_a2, src_a3); \ + src_bi = (v4f32) __msa_cast_to_vector_double(*((double *) pb0)); \ + SPLATI_W2_SP(src_bi, 0, src_br, src_bi); \ + \ + PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ + PCKEVOD_W2_SP(src_a3, src_a2, src_a1r, src_a1i); \ + \ + /* 0th col */ \ + res0_r OP0## = src_a0r * src_br; \ + res0_r OP1## = src_a0i * src_bi; \ + res0_i OP2## = (OP4 src_a0r) * src_bi; \ + res0_i OP3## = src_a0i * src_br; \ + \ + res1_r OP0## = src_a1r * src_br; \ + res1_r OP1## = src_a1i * src_bi; \ + res1_i OP2## = (OP4 src_a1r) * src_bi; \ + res1_i OP3## = src_a1i * src_br; \ +} + +#define CGEMM_KERNEL_4X4_MSA(OP0, OP1, OP2, OP3, OP4) \ +{ \ + LD_SP2_INC(pa0, 4, src_a0, src_a1); \ + LD_SP2_INC(pb0, 4, src_b0, src_b1); \ + \ + PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ + \ + /* 0th col */ \ + SPLATI_W2_SP(src_b0, 0, src_br, src_bi); \ + res0_r OP0## = src_a0r * src_br; \ + res0_r OP1## = src_a0i * src_bi; \ + res0_i OP2## = OP4 
src_a0r * src_bi; \ + res0_i OP3## = src_a0i * src_br; \ + \ + /* 1st col */ \ + SPLATI_W2_SP(src_b0, 2, src_br, src_bi); \ + res2_r OP0## = src_a0r * src_br; \ + res2_r OP1## = src_a0i * src_bi; \ + res2_i OP2## = OP4 src_a0r * src_bi; \ + res2_i OP3## = src_a0i * src_br; \ + \ + /* 2nd col */ \ + SPLATI_W2_SP(src_b1, 0, src_br, src_bi); \ + res4_r OP0## = src_a0r * src_br; \ + res4_r OP1## = src_a0i * src_bi; \ + res4_i OP2## = OP4 src_a0r * src_bi; \ + res4_i OP3## = src_a0i * src_br; \ + \ + /* 3rd col */ \ + SPLATI_W2_SP(src_b1, 2, src_br, src_bi); \ + res6_r OP0## = src_a0r * src_br; \ + res6_r OP1## = src_a0i * src_bi; \ + res6_i OP2## = OP4 src_a0r * src_bi; \ + res6_i OP3## = src_a0i * src_br; \ +} + +#define CGEMM_KERNEL_4X2_MSA(OP0, OP1, OP2, OP3, OP4) \ +{ \ + LD_SP2_INC(pa0, 4, src_a0, src_a1); \ + src_b0 = LD_SP(pb0); \ + \ + PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ + \ + /* 0th col */ \ + SPLATI_W2_SP(src_b0, 0, src_br, src_bi); \ + res0_r OP0## = src_a0r * src_br; \ + res0_r OP1## = src_a0i * src_bi; \ + res0_i OP2## = OP4 src_a0r * src_bi; \ + res0_i OP3## = src_a0i * src_br; \ + \ + /* 1st col */ \ + SPLATI_W2_SP(src_b0, 2, src_br, src_bi); \ + res2_r OP0## = src_a0r * src_br; \ + res2_r OP1## = src_a0i * src_bi; \ + res2_i OP2## = OP4 src_a0r * src_bi; \ + res2_i OP3## = src_a0i * src_br; \ +} + +#define CGEMM_KERNEL_4X1_MSA(OP0, OP1, OP2, OP3, OP4) \ +{ \ + LD_SP2_INC(pa0, 4, src_a0, src_a1); \ + src_bi = (v4f32) __msa_cast_to_vector_double(*((double *) pb0)); \ + SPLATI_W2_SP(src_bi, 0, src_br, src_bi); \ + \ + PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ + \ + /* 0th col */ \ + res0_r OP0## = src_a0r * src_br; \ + res0_r OP1## = src_a0i * src_bi; \ + res0_i OP2## = OP4 src_a0r * src_bi; \ + res0_i OP3## = src_a0i * src_br; \ +} + +#define CGEMM_KERNEL_2X4(OP0, OP1, OP2, OP3, OP4) \ +{ \ + a0_r = pa0[0]; \ + a0_i = pa0[1]; \ + b0_r = pb0[0]; \ + b0_i = pb0[1]; \ + \ + res0 OP0## = a0_r * b0_r; \ + res0 OP1## = a0_i * b0_i; \ + 
res1 OP2## = OP4 a0_r * b0_i; \ + res1 OP3## = a0_i * b0_r; \ + \ + a1_r = pa0[2]; \ + a1_i = pa0[3]; \ + res2 OP0## = a1_r * b0_r; \ + res2 OP1## = a1_i * b0_i; \ + res3 OP2## = OP4 a1_r * b0_i; \ + res3 OP3## = a1_i * b0_r; \ + \ + /* 1st col */ \ + b1_r = pb0[2]; \ + b1_i = pb0[3]; \ + res4 OP0## = a0_r * b1_r; \ + res4 OP1## = a0_i * b1_i; \ + res5 OP2## = OP4 a0_r * b1_i; \ + res5 OP3## = a0_i * b1_r; \ + \ + res6 OP0## = a1_r * b1_r; \ + res6 OP1## = a1_i * b1_i; \ + res7 OP2## = OP4 a1_r * b1_i; \ + res7 OP3## = a1_i * b1_r; \ + \ + /* 2nd col */ \ + b2_r = pb0[4]; \ + b2_i = pb0[5]; \ + res8 OP0## = a0_r * b2_r; \ + res8 OP1## = a0_i * b2_i; \ + res9 OP2## = OP4 a0_r * b2_i; \ + res9 OP3## = a0_i * b2_r; \ + \ + res10 OP0## = a1_r * b2_r; \ + res10 OP1## = a1_i * b2_i; \ + res11 OP2## = OP4 a1_r * b2_i; \ + res11 OP3## = a1_i * b2_r; \ + \ + /* 3rd col */ \ + b3_r = pb0[6]; \ + b3_i = pb0[7]; \ + res12 OP0## = a0_r * b3_r; \ + res12 OP1## = a0_i * b3_i; \ + res13 OP2## = OP4 a0_r * b3_i; \ + res13 OP3## = a0_i * b3_r; \ + \ + res14 OP0## = a1_r * b3_r; \ + res14 OP1## = a1_i * b3_i; \ + res15 OP2## = OP4 a1_r * b3_i; \ + res15 OP3## = a1_i * b3_r; \ +} + +#define CGEMM_KERNEL_2X2(OP0, OP1, OP2, OP3, OP4) \ +{ \ + /* 0th col */ \ + a0_r = pa0[0]; \ + a0_i = pa0[1]; \ + b0_r = pb0[0]; \ + b0_i = pb0[1]; \ + \ + res0 OP0## = a0_r * b0_r; \ + res0 OP1## = a0_i * b0_i; \ + res1 OP2## = OP4 a0_r * b0_i; \ + res1 OP3## = a0_i * b0_r; \ + \ + a1_r = pa0[2]; \ + a1_i = pa0[3]; \ + res2 OP0## = a1_r * b0_r; \ + res2 OP1## = a1_i * b0_i; \ + res3 OP2## = OP4 a1_r * b0_i; \ + res3 OP3## = a1_i * b0_r; \ + \ + /* 1st col */ \ + b1_r = pb0[2]; \ + b1_i = pb0[3]; \ + res4 OP0## = a0_r * b1_r; \ + res4 OP1## = a0_i * b1_i; \ + res5 OP2## = OP4 a0_r * b1_i; \ + res5 OP3## = a0_i * b1_r; \ + \ + res6 OP0## = a1_r * b1_r; \ + res6 OP1## = a1_i * b1_i; \ + res7 OP2## = OP4 a1_r * b1_i; \ + res7 OP3## = a1_i * b1_r; \ +} + +#define CGEMM_KERNEL_2X1(OP0, OP1, OP2, OP3, OP4) \ +{ 
\ + /* 0th col */ \ + a0_r = pa0[0]; \ + a0_i = pa0[1]; \ + b0_r = pb0[0]; \ + b0_i = pb0[1]; \ + \ + res0 OP0## = a0_r * b0_r; \ + res0 OP1## = a0_i * b0_i; \ + res1 OP2## = OP4 a0_r * b0_i; \ + res1 OP3## = a0_i * b0_r; \ + \ + a1_r = pa0[2]; \ + a1_i = pa0[3]; \ + res2 OP0## = a1_r * b0_r; \ + res2 OP1## = a1_i * b0_i; \ + res3 OP2## = OP4 a1_r * b0_i; \ + res3 OP3## = a1_i * b0_r; \ +} + +#define CGEMM_KERNEL_1X4(OP0, OP1, OP2, OP3, OP4) \ +{ \ + /* 0th col */ \ + a0_r = pa0[0]; \ + a0_i = pa0[1]; \ + b0_r = pb0[0]; \ + b0_i = pb0[1]; \ + \ + res0 OP0## = a0_r * b0_r; \ + res0 OP1## = a0_i * b0_i; \ + res1 OP2## = OP4 a0_r * b0_i; \ + res1 OP3## = a0_i * b0_r; \ + \ + /* 1st col */ \ + b1_r = pb0[2]; \ + b1_i = pb0[3]; \ + res2 OP0## = a0_r * b1_r; \ + res2 OP1## = a0_i * b1_i; \ + res3 OP2## = OP4 a0_r * b1_i; \ + res3 OP3## = a0_i * b1_r; \ + \ + /* 2nd col */ \ + b2_r = pb0[4]; \ + b2_i = pb0[5]; \ + res4 OP0## = a0_r * b2_r; \ + res4 OP1## = a0_i * b2_i; \ + res5 OP2## = OP4 a0_r * b2_i; \ + res5 OP3## = a0_i * b2_r; \ + \ + /* 3rd col */ \ + b3_r = pb0[6]; \ + b3_i = pb0[7]; \ + res6 OP0## = a0_r * b3_r; \ + res6 OP1## = a0_i * b3_i; \ + res7 OP2## = OP4 a0_r * b3_i; \ + res7 OP3## = a0_i * b3_r; \ +} + +#define CGEMM_KERNEL_1X2(OP0, OP1, OP2, OP3, OP4) \ +{ \ + /* 0th col */ \ + a0_r = pa0[0]; \ + a0_i = pa0[1]; \ + b0_r = pb0[0]; \ + b0_i = pb0[1]; \ + \ + res0 OP0## = a0_r * b0_r; \ + res0 OP1## = a0_i * b0_i; \ + res1 OP2## = OP4 a0_r * b0_i; \ + res1 OP3## = a0_i * b0_r; \ + \ + /* 1st col */ \ + b1_r = pb0[2]; \ + b1_i = pb0[3]; \ + res2 OP0## = a0_r * b1_r; \ + res2 OP1## = a0_i * b1_i; \ + res3 OP2## = OP4 a0_r * b1_i; \ + res3 OP3## = a0_i * b1_r; \ +} + +#define CGEMM_KERNEL_1X1(OP0, OP1, OP2, OP3, OP4) \ +{ \ + /* 0th col */ \ + a0_r = pa0[0]; \ + a0_i = pa0[1]; \ + b0_r = pb0[0]; \ + b0_i = pb0[1]; \ + \ + res0 OP0## = a0_r * b0_r; \ + res0 OP1## = a0_i * b0_i; \ + res1 OP2## = OP4 a0_r * b0_i; \ + res1 OP3## = a0_i * b0_r; \ +} + +#define 
CGEMM_SCALE_8X4_MSA \ +{ \ + LD_SP4(pc0, 4, dst0, dst1, dst2, dst3); \ + \ + PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ + PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \ + \ + dst0_r += alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i += alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + dst1_r += alpha_r * res1_r; \ + dst1_r -= alpha_i * res1_i; \ + dst1_i += alpha_r * res1_i; \ + dst1_i += alpha_i * res1_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ + \ + ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \ + \ + LD_SP4(pc1, 4, dst0, dst1, dst2, dst3); \ + \ + PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ + PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \ + \ + dst0_r += alpha_r * res2_r; \ + dst0_r -= alpha_i * res2_i; \ + dst0_i += alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + dst1_r += alpha_r * res3_r; \ + dst1_r -= alpha_i * res3_i; \ + dst1_i += alpha_r * res3_i; \ + dst1_i += alpha_i * res3_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ + \ + ST_SP4_INC(dst0, dst1, dst2, dst3, pc1, 4); \ + \ + LD_SP4(pc2, 4, dst0, dst1, dst2, dst3); \ + \ + PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ + PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \ + \ + dst0_r += alpha_r * res4_r; \ + dst0_r -= alpha_i * res4_i; \ + dst0_i += alpha_r * res4_i; \ + dst0_i += alpha_i * res4_r; \ + \ + dst1_r += alpha_r * res5_r; \ + dst1_r -= alpha_i * res5_i; \ + dst1_i += alpha_r * res5_i; \ + dst1_i += alpha_i * res5_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ + \ + ST_SP4_INC(dst0, dst1, dst2, dst3, pc2, 4); \ + \ + LD_SP4(pc3, 4, dst0, dst1, dst2, dst3); \ + \ + PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ + PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \ + \ + dst0_r += alpha_r * res6_r; \ + dst0_r -= alpha_i * res6_i; \ + dst0_i += alpha_r * res6_i; \ + dst0_i += alpha_i * res6_r; \ + \ + dst1_r += alpha_r * 
res7_r; \ + dst1_r -= alpha_i * res7_i; \ + dst1_i += alpha_r * res7_i; \ + dst1_i += alpha_i * res7_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ + \ + ST_SP4_INC(dst0, dst1, dst2, dst3, pc3, 4); \ +} + +#define CGEMM_SCALE_8X2_MSA \ +{ \ + LD_SP4(pc0, 4, dst0, dst1, dst2, dst3); \ + \ + PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ + PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \ + \ + dst0_r += alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i += alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + dst1_r += alpha_r * res1_r; \ + dst1_r -= alpha_i * res1_i; \ + dst1_i += alpha_r * res1_i; \ + dst1_i += alpha_i * res1_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ + \ + ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \ + \ + LD_SP4(pc1, 4, dst0, dst1, dst2, dst3); \ + \ + PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ + PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \ + \ + dst0_r += alpha_r * res2_r; \ + dst0_r -= alpha_i * res2_i; \ + dst0_i += alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + dst1_r += alpha_r * res3_r; \ + dst1_r -= alpha_i * res3_i; \ + dst1_i += alpha_r * res3_i; \ + dst1_i += alpha_i * res3_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ + \ + ST_SP4_INC(dst0, dst1, dst2, dst3, pc1, 4); \ +} + +#define CGEMM_SCALE_8X1_MSA \ +{ \ + LD_SP4(pc0, 4, dst0, dst1, dst2, dst3); \ + \ + PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ + PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \ + \ + dst0_r += alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i += alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + dst1_r += alpha_r * res1_r; \ + dst1_r -= alpha_i * res1_i; \ + dst1_i += alpha_r * res1_i; \ + dst1_i += alpha_i * res1_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ + \ + ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \ +} + 
+#define CGEMM_SCALE_4X4_MSA \ +{ \ + LD_SP2(pc0, 4, dst0, dst1); \ + \ + PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i += alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_SP2_INC(dst0, dst1, pc0, 4); \ + \ + LD_SP2(pc1, 4, dst0, dst1); \ + \ + PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res2_r; \ + dst0_r -= alpha_i * res2_i; \ + dst0_i += alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_SP2_INC(dst0, dst1, pc1, 4); \ + \ + LD_SP2(pc2, 4, dst0, dst1); \ + \ + PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res4_r; \ + dst0_r -= alpha_i * res4_i; \ + dst0_i += alpha_r * res4_i; \ + dst0_i += alpha_i * res4_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_SP2_INC(dst0, dst1, pc2, 4); \ + \ + LD_SP2(pc3, 4, dst0, dst1); \ + \ + PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res6_r; \ + dst0_r -= alpha_i * res6_i; \ + dst0_i += alpha_r * res6_i; \ + dst0_i += alpha_i * res6_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_SP2_INC(dst0, dst1, pc3, 4); \ +} + +#define CGEMM_SCALE_4X2_MSA \ +{ \ + LD_SP2(pc0, 4, dst0, dst1); \ + \ + PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i += alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_SP2_INC(dst0, dst1, pc0, 4); \ + \ + LD_SP2(pc1, 4, dst0, dst1); \ + \ + PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res2_r; \ + dst0_r -= alpha_i * res2_i; \ + dst0_i += alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_SP2_INC(dst0, dst1, pc1, 4); \ +} + +#define CGEMM_SCALE_4X1_MSA \ +{ \ + LD_SP2(pc0, 4, dst0, dst1); \ + \ + 
PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i += alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_SP2_INC(dst0, dst1, pc0, 4); \ +} + +#define CGEMM_SCALE_2X4 \ +{ \ + /* 0th col */ \ + pc0[0] += alphar * res0; \ + pc0[0] -= alphai * res1; \ + pc0[1] += alphar * res1; \ + pc0[1] += alphai * res0; \ + pc0[2] += alphar * res2; \ + pc0[2] -= alphai * res3; \ + pc0[3] += alphar * res3; \ + pc0[3] += alphai * res2; \ + \ + /* 1st col */ \ + pc1[0] += alphar * res4; \ + pc1[0] -= alphai * res5; \ + pc1[1] += alphar * res5; \ + pc1[1] += alphai * res4; \ + pc1[2] += alphar * res6; \ + pc1[2] -= alphai * res7; \ + pc1[3] += alphar * res7; \ + pc1[3] += alphai * res6; \ + \ + /* 2nd col */ \ + pc2[0] += alphar * res8; \ + pc2[0] -= alphai * res9; \ + pc2[1] += alphar * res9; \ + pc2[1] += alphai * res8; \ + pc2[2] += alphar * res10; \ + pc2[2] -= alphai * res11; \ + pc2[3] += alphar * res11; \ + pc2[3] += alphai * res10; \ + \ + /* 3rd col */ \ + pc3[0] += alphar * res12; \ + pc3[0] -= alphai * res13; \ + pc3[1] += alphar * res13; \ + pc3[1] += alphai * res12; \ + pc3[2] += alphar * res14; \ + pc3[2] -= alphai * res15; \ + pc3[3] += alphar * res15; \ + pc3[3] += alphai * res14; \ +} + +#define CGEMM_SCALE_2X2 \ +{ \ + /* 0th col */ \ + pc0[0] += alphar * res0; \ + pc0[0] -= alphai * res1; \ + pc0[1] += alphar * res1; \ + pc0[1] += alphai * res0; \ + pc0[2] += alphar * res2; \ + pc0[2] -= alphai * res3; \ + pc0[3] += alphar * res3; \ + pc0[3] += alphai * res2; \ + \ + /* 1st col */ \ + pc1[0] += alphar * res4; \ + pc1[0] -= alphai * res5; \ + pc1[1] += alphar * res5; \ + pc1[1] += alphai * res4; \ + pc1[2] += alphar * res6; \ + pc1[2] -= alphai * res7; \ + pc1[3] += alphar * res7; \ + pc1[3] += alphai * res6; \ +} + +#define CGEMM_SCALE_2X1 \ +{ \ + pc0[0] += alphar * res0; \ + pc0[0] -= alphai * res1; \ + pc0[1] += alphar * res1; \ + 
pc0[1] += alphai * res0; \ + \ + pc0[2] += alphar * res2; \ + pc0[2] -= alphai * res3; \ + pc0[3] += alphar * res3; \ + pc0[3] += alphai * res2; \ +} + +#define CGEMM_SCALE_1X4 \ +{ \ + pc0[0] += alphar * res0; \ + pc0[0] -= alphai * res1; \ + pc0[1] += alphar * res1; \ + pc0[1] += alphai * res0; \ + \ + pc1[0] += alphar * res2; \ + pc1[0] -= alphai * res3; \ + pc1[1] += alphar * res3; \ + pc1[1] += alphai * res2; \ + \ + pc2[0] += alphar * res4; \ + pc2[0] -= alphai * res5; \ + pc2[1] += alphar * res5; \ + pc2[1] += alphai * res4; \ + \ + pc3[0] += alphar * res6; \ + pc3[0] -= alphai * res7; \ + pc3[1] += alphar * res7; \ + pc3[1] += alphai * res6; \ +} + +#define CGEMM_SCALE_1X2 \ +{ \ + pc0[0] += alphar * res0; \ + pc0[0] -= alphai * res1; \ + pc0[1] += alphar * res1; \ + pc0[1] += alphai * res0; \ + \ + pc1[0] += alphar * res2; \ + pc1[0] -= alphai * res3; \ + pc1[1] += alphar * res3; \ + pc1[1] += alphai * res2; \ +} + +#define CGEMM_SCALE_1X1 \ +{ \ + pc0[0] += alphar * res0; \ + pc0[0] -= alphai * res1; \ + pc0[1] += alphar * res1; \ + pc0[1] += alphai * res0; \ +} + +#define CGEMM_TRMM_SCALE_8X4_MSA \ +{ \ + dst0_r = alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i = alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + dst1_r = alpha_r * res1_r; \ + dst1_r -= alpha_i * res1_i; \ + dst1_i = alpha_r * res1_i; \ + dst1_i += alpha_i * res1_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ + \ + ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \ + \ + dst0_r = alpha_r * res2_r; \ + dst0_r -= alpha_i * res2_i; \ + dst0_i = alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + dst1_r = alpha_r * res3_r; \ + dst1_r -= alpha_i * res3_i; \ + dst1_i = alpha_r * res3_i; \ + dst1_i += alpha_i * res3_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ + \ + ST_SP4_INC(dst0, dst1, dst2, dst3, pc1, 4); \ + \ + dst0_r = alpha_r * res4_r; \ + dst0_r -= alpha_i * 
res4_i; \ + dst0_i = alpha_r * res4_i; \ + dst0_i += alpha_i * res4_r; \ + \ + dst1_r = alpha_r * res5_r; \ + dst1_r -= alpha_i * res5_i; \ + dst1_i = alpha_r * res5_i; \ + dst1_i += alpha_i * res5_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ + \ + ST_SP4_INC(dst0, dst1, dst2, dst3, pc2, 4); \ + \ + dst0_r = alpha_r * res6_r; \ + dst0_r -= alpha_i * res6_i; \ + dst0_i = alpha_r * res6_i; \ + dst0_i += alpha_i * res6_r; \ + \ + dst1_r = alpha_r * res7_r; \ + dst1_r -= alpha_i * res7_i; \ + dst1_i = alpha_r * res7_i; \ + dst1_i += alpha_i * res7_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ + \ + ST_SP4_INC(dst0, dst1, dst2, dst3, pc3, 4); \ +} + +#define CGEMM_TRMM_SCALE_8X2_MSA \ +{ \ + dst0_r = alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i = alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + dst1_r = alpha_r * res1_r; \ + dst1_r -= alpha_i * res1_i; \ + dst1_i = alpha_r * res1_i; \ + dst1_i += alpha_i * res1_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ + \ + ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \ + \ + dst0_r = alpha_r * res2_r; \ + dst0_r -= alpha_i * res2_i; \ + dst0_i = alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + dst1_r = alpha_r * res3_r; \ + dst1_r -= alpha_i * res3_i; \ + dst1_i = alpha_r * res3_i; \ + dst1_i += alpha_i * res3_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ + \ + ST_SP4_INC(dst0, dst1, dst2, dst3, pc1, 4); \ +} + +#define CGEMM_TRMM_SCALE_8X1_MSA \ +{ \ + dst0_r = alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i = alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + dst1_r = alpha_r * res1_r; \ + dst1_r -= alpha_i * res1_i; \ + dst1_i = alpha_r * res1_i; \ + dst1_i += alpha_i * res1_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_W2_SP(dst1_i, dst1_r, dst2, 
dst3); \ + \ + ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \ +} + +#define CGEMM_TRMM_SCALE_4X4_MSA \ +{ \ + dst0_r = alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i = alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_SP2_INC(dst0, dst1, pc0, 4); \ + \ + dst0_r = alpha_r * res2_r; \ + dst0_r -= alpha_i * res2_i; \ + dst0_i = alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_SP2_INC(dst0, dst1, pc1, 4); \ + \ + dst0_r = alpha_r * res4_r; \ + dst0_r -= alpha_i * res4_i; \ + dst0_i = alpha_r * res4_i; \ + dst0_i += alpha_i * res4_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_SP2_INC(dst0, dst1, pc2, 4); \ + \ + dst0_r = alpha_r * res6_r; \ + dst0_r -= alpha_i * res6_i; \ + dst0_i = alpha_r * res6_i; \ + dst0_i += alpha_i * res6_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_SP2_INC(dst0, dst1, pc3, 4); \ +} + +#define CGEMM_TRMM_SCALE_4X2_MSA \ +{ \ + dst0_r = alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i = alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_SP2_INC(dst0, dst1, pc0, 4); \ + \ + dst0_r = alpha_r * res2_r; \ + dst0_r -= alpha_i * res2_i; \ + dst0_i = alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_SP2_INC(dst0, dst1, pc1, 4); \ +} + +#define CGEMM_TRMM_SCALE_4X1_MSA \ +{ \ + dst0_r = alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i = alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_SP2_INC(dst0, dst1, pc0, 4); \ +} + +#define CGEMM_TRMM_SCALE_2X4 \ +{ \ + /* 0th col */ \ + pc0[0] = alphar * res0; \ + pc0[0] -= alphai * res1; \ + pc0[1] = alphar * res1; \ + pc0[1] += alphai * res0; \ + pc0[2] = alphar * res2; \ + pc0[2] -= alphai * res3; \ + pc0[3] = alphar * res3; \ + pc0[3] += alphai * 
res2; \ + \ + /* 1st col */ \ + pc1[0] = alphar * res4; \ + pc1[0] -= alphai * res5; \ + pc1[1] = alphar * res5; \ + pc1[1] += alphai * res4; \ + pc1[2] = alphar * res6; \ + pc1[2] -= alphai * res7; \ + pc1[3] = alphar * res7; \ + pc1[3] += alphai * res6; \ + \ + /* 2nd col */ \ + pc2[0] = alphar * res8; \ + pc2[0] -= alphai * res9; \ + pc2[1] = alphar * res9; \ + pc2[1] += alphai * res8; \ + pc2[2] = alphar * res10; \ + pc2[2] -= alphai * res11; \ + pc2[3] = alphar * res11; \ + pc2[3] += alphai * res10; \ + \ + /* 3rd col */ \ + pc3[0] = alphar * res12; \ + pc3[0] -= alphai * res13; \ + pc3[1] = alphar * res13; \ + pc3[1] += alphai * res12; \ + pc3[2] = alphar * res14; \ + pc3[2] -= alphai * res15; \ + pc3[3] = alphar * res15; \ + pc3[3] += alphai * res14; \ +} + +#define CGEMM_TRMM_SCALE_2X2 \ +{ \ + /* 0th col */ \ + pc0[0] = alphar * res0; \ + pc0[0] -= alphai * res1; \ + pc0[1] = alphar * res1; \ + pc0[1] += alphai * res0; \ + pc0[2] = alphar * res2; \ + pc0[2] -= alphai * res3; \ + pc0[3] = alphar * res3; \ + pc0[3] += alphai * res2; \ + \ + /* 1st col */ \ + pc1[0] = alphar * res4; \ + pc1[0] -= alphai * res5; \ + pc1[1] = alphar * res5; \ + pc1[1] += alphai * res4; \ + pc1[2] = alphar * res6; \ + pc1[2] -= alphai * res7; \ + pc1[3] = alphar * res7; \ + pc1[3] += alphai * res6; \ +} + +#define CGEMM_TRMM_SCALE_2X1 \ +{ \ + pc0[0] = alphar * res0; \ + pc0[0] -= alphai * res1; \ + pc0[1] = alphar * res1; \ + pc0[1] += alphai * res0; \ + \ + pc0[2] = alphar * res2; \ + pc0[2] -= alphai * res3; \ + pc0[3] = alphar * res3; \ + pc0[3] += alphai * res2; \ +} + +#define CGEMM_TRMM_SCALE_1X4 \ +{ \ + pc0[0] = alphar * res0; \ + pc0[0] -= alphai * res1; \ + pc0[1] = alphar * res1; \ + pc0[1] += alphai * res0; \ + \ + pc1[0] = alphar * res2; \ + pc1[0] -= alphai * res3; \ + pc1[1] = alphar * res3; \ + pc1[1] += alphai * res2; \ + \ + pc2[0] = alphar * res4; \ + pc2[0] -= alphai * res5; \ + pc2[1] = alphar * res5; \ + pc2[1] += alphai * res4; \ + \ + pc3[0] = alphar * 
res6; \ + pc3[0] -= alphai * res7; \ + pc3[1] = alphar * res7; \ + pc3[1] += alphai * res6; \ +} + +#define CGEMM_TRMM_SCALE_1X2 \ +{ \ + pc0[0] = alphar * res0; \ + pc0[0] -= alphai * res1; \ + pc0[1] = alphar * res1; \ + pc0[1] += alphai * res0; \ + \ + pc1[0] = alphar * res2; \ + pc1[0] -= alphai * res3; \ + pc1[1] = alphar * res3; \ + pc1[1] += alphai * res2; \ +} + +#define CGEMM_TRMM_SCALE_1X1 \ +{ \ + pc0[0] = alphar * res0; \ + pc0[0] -= alphai * res1; \ + pc0[1] = alphar * res1; \ + pc0[1] += alphai * res0; \ +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, + FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc +#ifdef TRMMKERNEL + , BLASLONG offset +#endif + ) +{ + BLASLONG i, j, l, temp; +#if defined(TRMMKERNEL) + BLASLONG off; +#endif + FLOAT *pc0, *pc1, *pc2, *pc3; + FLOAT *pa0, *pb0; + FLOAT res0, res1, res2, res3, res4, res5, res6, res7; + FLOAT res8, res9, res10, res11, res12, res13, res14, res15; + FLOAT a0_r, a1_r; + FLOAT a0_i, a1_i; + FLOAT b0_r, b1_r, b2_r, b3_r; + FLOAT b0_i, b1_i, b2_i, b3_i; + v4f32 src_a0, src_a1, src_a2, src_a3, src_b0, src_b1; + v4f32 src_a0r, src_a0i, src_a1r, src_a1i, src_br, src_bi; + v4f32 dst0, dst1, dst2, dst3; + v4f32 alpha_r, alpha_i; + v4f32 res0_r, res0_i, res1_r, res1_i, res2_r, res2_i, res3_r, res3_i; + v4f32 res4_r, res4_i, res5_r, res5_i, res6_r, res6_i, res7_r, res7_i; + v4f32 dst0_r, dst0_i, dst1_r, dst1_i; + + alpha_r = COPY_FLOAT_TO_VECTOR(alphar); + alpha_i = COPY_FLOAT_TO_VECTOR(alphai); + +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#endif + + for (j = (n >> 2); j--;) + { + pc0 = C; + pc1 = pc0 + 2 * ldc; + pc2 = pc1 + 2 * ldc; + pc3 = pc2 + 2 * ldc; + + pa0 = A; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + for (i = (m >> 3); i--;) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 8; + pb0 = B + off * 2 * 4; +#endif + +#if (defined(LEFT) && 
!defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 8; // number of values in A +#else + temp = off + 4; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_8X4_MSA(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_8X4_MSA(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_8X4_MSA(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_8X4_MSA(, -, , -, -); +#endif + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_8X4_MSA(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_8X4_MSA(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_8X4_MSA(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_8X4_MSA(+, -, -, -,); +#endif + } + +#if defined(TRMMKERNEL) + CGEMM_TRMM_SCALE_8X4_MSA +#else + CGEMM_SCALE_8X4_MSA +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 8; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 2 * 8; + pb0 += temp * 2 * 4; +#endif + +#ifdef LEFT + off += 8; // number of values in A +#endif +#endif + } + + if (m & 4) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 4; + pb0 = B + off * 2 * 4; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 4; // number of values in A +#else + temp = off + 4; // number of values in B +#endif 
+#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_4X4_MSA(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_4X4_MSA(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_4X4_MSA(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_4X4_MSA(, -, , -, -); +#endif + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_4X4_MSA(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_4X4_MSA(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_4X4_MSA(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_4X4_MSA(+, -, -, -,); +#endif + } + +#if defined(TRMMKERNEL) + CGEMM_TRMM_SCALE_4X4_MSA +#else + CGEMM_SCALE_4X4_MSA +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 2 * 4; + pb0 += temp * 2 * 4; +#endif + +#ifdef LEFT + off += 4; // number of values in A +#endif +#endif + } + + if (m & 2) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 2; + pb0 = B + off * 2 * 4; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 2; // number of values in A +#else + temp = off + 4; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_2X4(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || 
defined(TC) + CGEMM_KERNEL_2X4(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_2X4(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_2X4(, -, , -, -); +#endif + + pa0 += 4; + pb0 += 8; + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_2X4(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_2X4(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_2X4(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_2X4(+, -, -, -,); +#endif + + pa0 += 4; + pb0 += 8; + } + +#if defined(TRMMKERNEL) + CGEMM_TRMM_SCALE_2X4 +#else + CGEMM_SCALE_2X4 +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 2 * 2; + pb0 += temp * 2 * 4; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif +#endif + + pc0 += 4; + pc1 += 4; + pc2 += 4; + pc3 += 4; + } + + if (m & 1) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 1; + pb0 = B + off * 2 * 4; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 1; // number of values in A +#else + temp = off + 4; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_1X4(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_1X4(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_1X4(, 
+, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_1X4(, -, , -, -); +#endif + + pa0 += 2; + pb0 += 8; + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_1X4(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_1X4(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_1X4(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_1X4(+, -, -, -,); +#endif + + pa0 += 2; + pb0 += 8; + } + +#if defined(TRMMKERNEL) + CGEMM_TRMM_SCALE_1X4 +#else + CGEMM_SCALE_1X4 +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 2 * 1; + pb0 += temp * 2 * 4; +#endif + +#ifdef LEFT + off += 1; // number of values in A +#endif +#endif + + pc0 += 2; + pc1 += 2; + pc2 += 2; + pc3 += 2; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 4; // number of values in A +#endif + + l = k << 3; + B = B + l; + i = ldc << 3; + C = C + i; + } + + if (n & 2) + { + pc0 = C; + pc1 = pc0 + 2 * ldc; + + pa0 = A; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + for (i = (m >> 3); i--;) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 8; + pb0 = B + off * 2 * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 8; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_8X2_MSA(, -, , +, +); +#endif +#if defined(NR) || 
defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_8X2_MSA(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_8X2_MSA(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_8X2_MSA(, -, , -, -); +#endif + + pb0 += 4; + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_8X2_MSA(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_8X2_MSA(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_8X2_MSA(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_8X2_MSA(+, -, -, -,); +#endif + + pb0 += 4; + } + +#if defined(TRMMKERNEL) + CGEMM_TRMM_SCALE_8X2_MSA +#else + CGEMM_SCALE_8X2_MSA +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 8; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 2 * 8; + pb0 += temp * 2 * 2; +#endif + +#ifdef LEFT + off += 8; // number of values in A +#endif +#endif + } + + if (m & 4) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 4; + pb0 = B + off * 2 * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 4; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_4X2_MSA(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_4X2_MSA(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + 
CGEMM_KERNEL_4X2_MSA(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_4X2_MSA(, -, , -, -); +#endif + + pb0 += 4; + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_4X2_MSA(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_4X2_MSA(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_4X2_MSA(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_4X2_MSA(+, -, -, -,); +#endif + + pb0 += 4; + } + +#if defined(TRMMKERNEL) + CGEMM_TRMM_SCALE_4X2_MSA +#else + CGEMM_SCALE_4X2_MSA +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 2 * 4; + pb0 += temp * 2 * 2; +#endif + +#ifdef LEFT + off += 4; // number of values in A +#endif +#endif + } + + if (m & 2) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 2; + pb0 = B + off * 2 * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 2; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_2X2(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_2X2(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_2X2(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_2X2(, -, , -, -); +#endif + + pa0 += 4; + pb0 += 4; + + for (l 
= (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_2X2(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_2X2(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_2X2(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_2X2(+, -, -, -,); +#endif + + pa0 += 4; + pb0 += 4; + } + +#if defined(TRMMKERNEL) + CGEMM_TRMM_SCALE_2X2 +#else + CGEMM_SCALE_2X2 +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 2 * 2; + pb0 += temp * 2 * 2; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif +#endif + + pc0 += 4; + pc1 += 4; + } + + if (m & 1) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 1; + pb0 = B + off * 2 * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 1; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_1X2(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_1X2(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_1X2(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_1X2(, -, , -, -); +#endif + + pa0 += 2; + pb0 += 4; + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_1X2(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || 
defined(TR) || defined(TC) + CGEMM_KERNEL_1X2(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_1X2(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_1X2(+, -, -, -,); +#endif + + pa0 += 2; + pb0 += 4; + } + +#if defined(TRMMKERNEL) + CGEMM_TRMM_SCALE_1X2 +#else + CGEMM_SCALE_1X2 +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 2 * 1; + pb0 += temp * 2 * 2; +#endif + +#ifdef LEFT + off += 1; // number of values in A +#endif +#endif + + pc0 += 2; + pc1 += 2; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; // number of values in A +#endif + + l = k << 2; + B = B + l; + i = ldc << 2; + C = C + i; + } + + if (n & 1) + { + pc0 = C; + pa0 = A; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + for (i = (m >> 3); i--;) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 8; + pb0 = B + off * 2 * 1; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 8; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_8X1_MSA(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_8X1_MSA(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_8X1_MSA(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_8X1_MSA(, -, , -, -); +#endif + + pb0 += 2; + + for (l = (temp - 1); l--;) + { +#if defined(NN) || 
defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_8X1_MSA(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_8X1_MSA(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_8X1_MSA(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_8X1_MSA(+, -, -, -,); +#endif + + pb0 += 2; + } + +#if defined(TRMMKERNEL) + CGEMM_TRMM_SCALE_8X1_MSA +#else + CGEMM_SCALE_8X1_MSA +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 8; // number of values in A +#else + temp -= 1; // number of values in B +#endif + pa0 += temp * 2 * 8; + pb0 += temp * 2 * 1; +#endif + +#ifdef LEFT + off += 8; // number of values in A +#endif +#endif + } + + if (m & 4) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 4; + pb0 = B + off * 2 * 1; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 4; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_4X1_MSA(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_4X1_MSA(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_4X1_MSA(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_4X1_MSA(, -, , -, -); +#endif + + pb0 += 2; + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_4X1_MSA(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + 
CGEMM_KERNEL_4X1_MSA(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_4X1_MSA(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_4X1_MSA(+, -, -, -,); +#endif + + pb0 += 2; + } + +#if defined(TRMMKERNEL) + CGEMM_TRMM_SCALE_4X1_MSA +#else + CGEMM_SCALE_4X1_MSA +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 1; // number of values in B +#endif + pa0 += temp * 2 * 4; + pb0 += temp * 2 * 1; +#endif + +#ifdef LEFT + off += 4; // number of values in A +#endif +#endif + } + + if (m & 2) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 2; + pb0 = B + off * 2 * 1; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 2; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_2X1(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_2X1(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_2X1(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_2X1(, -, , -, -); +#endif + + pa0 += 4; + pb0 += 2; + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_2X1(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_2X1(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_2X1(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || 
defined(CR) || defined(CC) + CGEMM_KERNEL_2X1(+, -, -, -,); +#endif + + pa0 += 4; + pb0 += 2; + } + +#if defined(TRMMKERNEL) + CGEMM_TRMM_SCALE_2X1 +#else + CGEMM_SCALE_2X1 +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 1; // number of values in B +#endif + pa0 += temp * 2 * 2; + pb0 += temp * 2 * 1; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif +#endif + + pc0 += 4; + } + + if (m & 1) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 1; + pb0 = B + off * 2 * 1; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 1; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_1X1(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_1X1(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_1X1(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_1X1(, -, , -, -); +#endif + + pa0 += 2; + pb0 += 2; + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_1X1(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_1X1(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_1X1(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_1X1(+, -, -, -,); +#endif + + pa0 += 2; + pb0 += 2; + } + +#if defined(TRMMKERNEL) + CGEMM_TRMM_SCALE_1X1 +#else + CGEMM_SCALE_1X1 
+#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 1; // number of values in B +#endif + pa0 += temp * 2 * 1; + pb0 += temp * 2 * 1; +#endif + +#ifdef LEFT + off += 1; // number of values in A +#endif +#endif + + pc0 += 2; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; // number of values in A +#endif + + l = k << 1; + B = B + l; + i = ldc << 1; + C = C + i; + } + + return 0; +} diff --git a/kernel/mips/cgemm_ncopy_4_msa.c b/kernel/mips/cgemm_ncopy_4_msa.c new file mode 100644 index 000000000..b38290b3d --- /dev/null +++ b/kernel/mips/cgemm_ncopy_4_msa.c @@ -0,0 +1,195 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst) +{ + BLASLONG i, j; + FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *pdst; + FLOAT ctemp01, ctemp02, ctemp03, ctemp04; + FLOAT ctemp05, ctemp06, ctemp07, ctemp08; + v4f32 src0, src1, src2, src3, src4, src5, src6, src7; + v4f32 dst0, dst1, dst4, dst5; + + psrc0 = src; + pdst = dst; + lda *= 2; + + for (j = (n >> 2); j--;) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc3 = psrc2 + lda; + psrc4 = psrc3 + lda; + psrc0 += 4 * lda; + + for (i = (m >> 2); i--;) + { + LD_SP2_INC(psrc1, 4, src0, src1); + LD_SP2_INC(psrc2, 4, src2, src3); + LD_SP2_INC(psrc3, 4, src4, src5); + LD_SP2_INC(psrc4, 4, src6, src7); + + ILVRL_D2_SP(src2, src0, dst0, dst4); + ILVRL_D2_SP(src6, src4, dst1, dst5); + + ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4); + + ILVRL_D2_SP(src3, src1, dst0, dst4); + ILVRL_D2_SP(src7, src5, dst1, dst5); + + ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4); + } + + if (m & 2) + { + src0 = LD_SP(psrc1); + src2 = LD_SP(psrc2); + src4 = LD_SP(psrc3); + src6 = LD_SP(psrc4); + psrc1 += 4; + psrc2 += 4; + psrc3 += 4; + psrc4 += 4; + + ILVRL_D2_SP(src2, src0, dst0, dst4); + ILVRL_D2_SP(src6, src4, dst1, dst5); + + ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4); + } + + if (m & 1) + { + ctemp01 = *(psrc1 + 0); + ctemp02 = *(psrc1 + 1); + ctemp03 = *(psrc2 + 0); + 
ctemp04 = *(psrc2 + 1); + ctemp05 = *(psrc3 + 0); + ctemp06 = *(psrc3 + 1); + ctemp07 = *(psrc4 + 0); + ctemp08 = *(psrc4 + 1); + psrc1 += 2; + psrc2 += 2; + psrc3 += 2; + psrc4 += 2; + + *(pdst + 0) = ctemp01; + *(pdst + 1) = ctemp02; + *(pdst + 2) = ctemp03; + *(pdst + 3) = ctemp04; + *(pdst + 4) = ctemp05; + *(pdst + 5) = ctemp06; + *(pdst + 6) = ctemp07; + *(pdst + 7) = ctemp08; + pdst += 8; + } + } + + if (n & 2) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc0 += 2 * lda; + + for (i = (m >> 2); i--;) + { + LD_SP2_INC(psrc1, 4, src0, src1); + LD_SP2_INC(psrc2, 4, src2, src3); + + ILVRL_D2_SP(src2, src0, dst0, dst4); + + ST_SP2_INC(dst0, dst4, pdst, 4); + + ILVRL_D2_SP(src3, src1, dst0, dst4); + + ST_SP2_INC(dst0, dst4, pdst, 4); + } + + if (m & 2) + { + src0 = LD_SP(psrc1); + src2 = LD_SP(psrc2); + psrc1 += 4; + psrc2 += 4; + + ILVRL_D2_SP(src2, src0, dst0, dst4); + + ST_SP2_INC(dst0, dst4, pdst, 4); + } + + if (m & 1) + { + ctemp01 = *(psrc1 + 0); + ctemp02 = *(psrc1 + 1); + ctemp03 = *(psrc2 + 0); + ctemp04 = *(psrc2 + 1); + psrc1 += 2; + psrc2 += 2; + + *(pdst + 0) = ctemp01; + *(pdst + 1) = ctemp02; + *(pdst + 2) = ctemp03; + *(pdst + 3) = ctemp04; + pdst += 4; + } + } + + if (n & 1) + { + psrc1 = psrc0; + + for (i = (m >> 2); i--;) + { + LD_SP2_INC(psrc1, 4, src0, src1); + ST_SP2_INC(src0, src1, pdst, 4); + } + + if (m & 2) + { + src0 = LD_SP(psrc1); + psrc1 += 4; + + ST_SP(src0, pdst); + pdst += 4; + } + + if (m & 1) + { + ctemp01 = *(psrc1 + 0); + ctemp02 = *(psrc1 + 1); + psrc1 += 2; + + *(pdst + 0) = ctemp01; + *(pdst + 1) = ctemp02; + pdst += 2; + } + } + + return 0; +} diff --git a/kernel/mips/cgemm_ncopy_8_msa.c b/kernel/mips/cgemm_ncopy_8_msa.c new file mode 100644 index 000000000..9ea749069 --- /dev/null +++ b/kernel/mips/cgemm_ncopy_8_msa.c @@ -0,0 +1,310 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst) +{ + BLASLONG i, j; + FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *psrc5, *psrc6, *psrc7; + FLOAT *psrc8, *pdst; + FLOAT ctemp01, ctemp02, ctemp03, ctemp04, ctemp05, ctemp06, ctemp07; + FLOAT ctemp08, ctemp09, ctemp10, ctemp11, ctemp12, ctemp13, ctemp14; + FLOAT ctemp15, ctemp16; + v4f32 src0, src1, src2, src3, src4, src5, src6, src7; + v4f32 src8, src9, src10, src11, src12, src13, src14, src15; + v4f32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + + psrc0 = src; + pdst = dst; + lda *= 2; + + for (j = (n >> 3); j--;) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc3 = psrc2 + lda; + psrc4 = psrc3 + lda; + psrc5 = psrc4 + lda; + psrc6 = psrc5 + lda; + psrc7 = psrc6 + lda; + psrc8 = psrc7 + lda; + psrc0 += 8 * lda; + + for (i = (m >> 2); i--;) + { + LD_SP2_INC(psrc1, 4, src0, src1); + LD_SP2_INC(psrc2, 4, src2, src3); + LD_SP2_INC(psrc3, 4, src4, src5); + LD_SP2_INC(psrc4, 4, src6, src7); + LD_SP2_INC(psrc5, 4, src8, src9); + LD_SP2_INC(psrc6, 4, src10, src11); + LD_SP2_INC(psrc7, 4, src12, src13); + LD_SP2_INC(psrc8, 4, src14, src15); + + ILVRL_D2_SP(src2, src0, dst0, dst4); + ILVRL_D2_SP(src6, src4, dst1, dst5); + ILVRL_D2_SP(src10, src8, dst2, dst6); + ILVRL_D2_SP(src14, src12, dst3, dst7); + + ST_SP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 4); + + ILVRL_D2_SP(src3, src1, dst0, dst4); + ILVRL_D2_SP(src7, src5, dst1, dst5); + ILVRL_D2_SP(src11, src9, dst2, dst6); + ILVRL_D2_SP(src15, src13, dst3, dst7); + + ST_SP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 4); + } + + if (m & 2) + { + src0 = LD_SP(psrc1); + src2 = LD_SP(psrc2); + src4 = LD_SP(psrc3); + src6 = LD_SP(psrc4); + src8 = LD_SP(psrc5); + src10 = LD_SP(psrc6); + src12 = LD_SP(psrc7); + src14 = LD_SP(psrc8); + psrc1 += 4; + psrc2 += 4; + psrc3 += 4; + psrc4 += 4; + 
psrc5 += 4; + psrc6 += 4; + psrc7 += 4; + psrc8 += 4; + + ILVRL_D2_SP(src2, src0, dst0, dst4); + ILVRL_D2_SP(src6, src4, dst1, dst5); + ILVRL_D2_SP(src10, src8, dst2, dst6); + ILVRL_D2_SP(src14, src12, dst3, dst7); + + ST_SP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 4); + } + + if (m & 1) + { + ctemp01 = *(psrc1 + 0); + ctemp02 = *(psrc1 + 1); + ctemp03 = *(psrc2 + 0); + ctemp04 = *(psrc2 + 1); + ctemp05 = *(psrc3 + 0); + ctemp06 = *(psrc3 + 1); + ctemp07 = *(psrc4 + 0); + ctemp08 = *(psrc4 + 1); + ctemp09 = *(psrc5 + 0); + ctemp10 = *(psrc5 + 1); + ctemp11 = *(psrc6 + 0); + ctemp12 = *(psrc6 + 1); + ctemp13 = *(psrc7 + 0); + ctemp14 = *(psrc7 + 1); + ctemp15 = *(psrc8 + 0); + ctemp16 = *(psrc8 + 1); + psrc1 += 2; + psrc2 += 2; + psrc3 += 2; + psrc4 += 2; + psrc5 += 2; + psrc6 += 2; + psrc7 += 2; + psrc8 += 2; + + *(pdst + 0) = ctemp01; + *(pdst + 1) = ctemp02; + *(pdst + 2) = ctemp03; + *(pdst + 3) = ctemp04; + *(pdst + 4) = ctemp05; + *(pdst + 5) = ctemp06; + *(pdst + 6) = ctemp07; + *(pdst + 7) = ctemp08; + *(pdst + 8) = ctemp09; + *(pdst + 9) = ctemp10; + *(pdst + 10) = ctemp11; + *(pdst + 11) = ctemp12; + *(pdst + 12) = ctemp13; + *(pdst + 13) = ctemp14; + *(pdst + 14) = ctemp15; + *(pdst + 15) = ctemp16; + pdst += 16; + } + } + + if (n & 4) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc3 = psrc2 + lda; + psrc4 = psrc3 + lda; + psrc0 += 4 * lda; + + for (i = (m >> 2); i--;) + { + LD_SP2_INC(psrc1, 4, src0, src1); + LD_SP2_INC(psrc2, 4, src2, src3); + LD_SP2_INC(psrc3, 4, src4, src5); + LD_SP2_INC(psrc4, 4, src6, src7); + + ILVRL_D2_SP(src2, src0, dst0, dst4); + ILVRL_D2_SP(src6, src4, dst1, dst5); + + ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4); + + ILVRL_D2_SP(src3, src1, dst0, dst4); + ILVRL_D2_SP(src7, src5, dst1, dst5); + + ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4); + } + + if (m & 2) + { + src0 = LD_SP(psrc1); + src2 = LD_SP(psrc2); + src4 = LD_SP(psrc3); + src6 = LD_SP(psrc4); + psrc1 += 4; + psrc2 += 4; + psrc3 += 4; + psrc4 += 4; 
+ + ILVRL_D2_SP(src2, src0, dst0, dst4); + ILVRL_D2_SP(src6, src4, dst1, dst5); + + ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4); + } + + if (m & 1) + { + ctemp01 = *(psrc1 + 0); + ctemp02 = *(psrc1 + 1); + ctemp03 = *(psrc2 + 0); + ctemp04 = *(psrc2 + 1); + ctemp05 = *(psrc3 + 0); + ctemp06 = *(psrc3 + 1); + ctemp07 = *(psrc4 + 0); + ctemp08 = *(psrc4 + 1); + psrc1 += 2; + psrc2 += 2; + psrc3 += 2; + psrc4 += 2; + + *(pdst + 0) = ctemp01; + *(pdst + 1) = ctemp02; + *(pdst + 2) = ctemp03; + *(pdst + 3) = ctemp04; + *(pdst + 4) = ctemp05; + *(pdst + 5) = ctemp06; + *(pdst + 6) = ctemp07; + *(pdst + 7) = ctemp08; + pdst += 8; + } + } + + if (n & 2) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc0 += 2 * lda; + + for (i = (m >> 2); i--;) + { + LD_SP2_INC(psrc1, 4, src0, src1); + LD_SP2_INC(psrc2, 4, src2, src3); + + ILVRL_D2_SP(src2, src0, dst0, dst4); + + ST_SP2_INC(dst0, dst4, pdst, 4); + + ILVRL_D2_SP(src3, src1, dst0, dst4); + + ST_SP2_INC(dst0, dst4, pdst, 4); + } + + if (m & 2) + { + src0 = LD_SP(psrc1); + src2 = LD_SP(psrc2); + psrc1 += 4; + psrc2 += 4; + + ILVRL_D2_SP(src2, src0, dst0, dst4); + + ST_SP2_INC(dst0, dst4, pdst, 4); + } + + if (m & 1) + { + ctemp01 = *(psrc1 + 0); + ctemp02 = *(psrc1 + 1); + ctemp03 = *(psrc2 + 0); + ctemp04 = *(psrc2 + 1); + psrc1 += 2; + psrc2 += 2; + + *(pdst + 0) = ctemp01; + *(pdst + 1) = ctemp02; + *(pdst + 2) = ctemp03; + *(pdst + 3) = ctemp04; + pdst += 4; + } + } + + if (n & 1) + { + psrc1 = psrc0; + + for (i = (m >> 2); i--;) + { + LD_SP2_INC(psrc1, 4, src0, src1); + ST_SP2_INC(src0, src1, pdst, 4); + } + + if (m & 2) + { + src0 = LD_SP(psrc1); + psrc1 += 4; + + ST_SP(src0, pdst); + pdst += 4; + } + + if (m & 1) + { + ctemp01 = *(psrc1 + 0); + ctemp02 = *(psrc1 + 1); + psrc1 += 2; + + *(pdst + 0) = ctemp01; + *(pdst + 1) = ctemp02; + pdst += 2; + } + } + + return 0; +} diff --git a/kernel/mips/cgemm_tcopy_4_msa.c b/kernel/mips/cgemm_tcopy_4_msa.c new file mode 100644 index 000000000..12aaa979e --- /dev/null +++ 
b/kernel/mips/cgemm_tcopy_4_msa.c @@ -0,0 +1,125 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst) +{ + BLASLONG i, j; + FLOAT *psrc0; + FLOAT *psrc1, *psrc2; + FLOAT *pdst0; + FLOAT ctemp01, ctemp02, ctemp03, ctemp04; + v4f32 src0, src1, src2, src3; + + psrc0 = src; + pdst0 = dst; + lda *= 2; + + for (j = (n >> 2); j--;) + { + psrc1 = psrc0; + psrc2 = psrc0 + lda; + psrc0 += 8; + + for (i = (m >> 1); i--;) + { + LD_SP2(psrc1, 4, src0, src1); + LD_SP2(psrc2, 4, src2, src3); + ST_SP4_INC(src0, src1, src2, src3, pdst0, 4); + psrc1 += 2 * lda; + psrc2 += 2 * lda; + } + + if (m & 1) + { + LD_SP2(psrc1, 4, src0, src1); + ST_SP2_INC(src0, src1, pdst0, 4); + } + } + + if (n & 2) + { + psrc1 = psrc0; + psrc2 = psrc0 + lda; + psrc0 += 4; + + for (i = (m >> 1); i--;) + { + src0 = LD_SP(psrc1); + src1 = LD_SP(psrc2); + ST_SP2_INC(src0, src1, pdst0, 4); + + psrc1 += 2 * lda; + psrc2 += 2 * lda; + } + + if (m & 1) + { + src0 = LD_SP(psrc1); + ST_SP(src0, pdst0); + pdst0 += 4; + } + } + + if (n & 1) + { + psrc1 = psrc0; + psrc2 = psrc0 + lda; + psrc0 += 2; + + for (i = (m >> 1); i--;) + { + ctemp01 = *(psrc1 + 0); + ctemp02 = *(psrc1 + 1); + ctemp03 = *(psrc2 + 0); + ctemp04 = *(psrc2 + 1); + + *(pdst0 + 0) = ctemp01; + *(pdst0 + 1) = ctemp02; + *(pdst0 + 2) = ctemp03; + *(pdst0 + 3) = ctemp04; + + psrc1 += 2 * lda; + psrc2 += 2 * lda; + pdst0 += 4; + } + + if (m & 1) + { + ctemp01 = *(psrc1 + 0); + ctemp02 = *(psrc1 + 1); + + *(pdst0 + 0) = ctemp01; + *(pdst0 + 1) = ctemp02; + pdst0 += 2; + } + } + + return 0; +} diff --git a/kernel/mips/cgemm_tcopy_8_msa.c b/kernel/mips/cgemm_tcopy_8_msa.c new file mode 100644 index 000000000..9f78fa73a --- /dev/null +++ b/kernel/mips/cgemm_tcopy_8_msa.c @@ -0,0 +1,214 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst) +{ + BLASLONG i, j; + FLOAT *psrc0, *psrc1, *psrc2, *pdst0; + FLOAT ctemp01, ctemp02, ctemp03, ctemp04; + v4f32 src0, src1, src2, src3, src4, src5, src6, src7; + v4f32 src8, src9, src10, src11, src12, src13, src14, src15; + + psrc0 = src; + pdst0 = dst; + lda *= 2; + + for (j = (n >> 3); j--;) + { + psrc1 = psrc0; + psrc2 = psrc0 + lda; + psrc0 += 16; + + for (i = (m >> 2); i--;) + { + LD_SP4(psrc1, 4, src0, src1, src2, src3); + LD_SP4(psrc2, 4, src4, src5, src6, src7); + LD_SP4(psrc1 + 2 * lda, 4, src8, src9, src10, src11); + LD_SP4(psrc2 + 2 * lda, 4, src12, src13, src14, src15); + ST_SP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst0, 4); + ST_SP8_INC(src8, src9, src10, src11, src12, src13, src14, src15, pdst0, 4); + psrc1 += 4 * lda; + psrc2 += 4 * lda; + } + + if (m & 2) + { + LD_SP4(psrc1, 4, src0, src1, src2, src3); + LD_SP4(psrc2, 4, src4, src5, src6, src7); + ST_SP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst0, 4); + psrc1 += 2 * lda; + psrc2 += 2 * lda; + } + + if (m & 1) + { + LD_SP4(psrc1, 4, src0, src1, src2, src3); + ST_SP4_INC(src0, src1, src2, src3, pdst0, 4); + } + } + + if (n & 4) + { + psrc1 = psrc0; + psrc2 = psrc0 + lda; + psrc0 += 8; + + for (i = (m >> 2); i--;) + { + LD_SP2(psrc1, 4, src0, src1); + LD_SP2(psrc2, 4, src2, src3); + LD_SP2(psrc1 + 2 * lda, 4, src4, src5); + LD_SP2(psrc2 + 2 * lda, 4, src6, src7); + + ST_SP4_INC(src0, src1, src2, src3, pdst0, 4); + ST_SP4_INC(src4, src5, src6, src7, pdst0, 4); + psrc1 += 4 * lda; + psrc2 += 4 * lda; + } + + if (m & 2) + { + LD_SP2(psrc1, 4, src0, src1); + LD_SP2(psrc2, 4, src2, src3); + ST_SP4_INC(src0, src1, src2, src3, pdst0, 4); + psrc1 += 2 * lda; + psrc2 += 2 * lda; + } + + if (m & 1) + { + LD_SP2(psrc1, 4, src0, src1); + ST_SP2_INC(src0, src1, pdst0, 4); + } + } + + 
if (n & 2) + { + psrc1 = psrc0; + psrc2 = psrc0 + lda; + psrc0 += 4; + + for (i = (m >> 2); i--;) + { + src0 = LD_SP(psrc1); + src1 = LD_SP(psrc2); + src2 = LD_SP(psrc1 + 2 * lda); + src3 = LD_SP(psrc2 + 2 * lda); + ST_SP4_INC(src0, src1, src2, src3, pdst0, 4); + + psrc1 += 4 * lda; + psrc2 += 4 * lda; + } + + if (m & 2) + { + src0 = LD_SP(psrc1); + src1 = LD_SP(psrc2); + ST_SP2_INC(src0, src1, pdst0, 4); + + psrc1 += 2 * lda; + psrc2 += 2 * lda; + } + + if (m & 1) + { + src0 = LD_SP(psrc1); + ST_SP(src0, pdst0); + pdst0 += 4; + } + } + + if (n & 1) + { + psrc1 = psrc0; + psrc2 = psrc0 + lda; + psrc0 += 2; + + for (i = (m >> 2); i--;) + { + ctemp01 = *(psrc1 + 0); + ctemp02 = *(psrc1 + 1); + ctemp03 = *(psrc2 + 0); + ctemp04 = *(psrc2 + 1); + + *(pdst0 + 0) = ctemp01; + *(pdst0 + 1) = ctemp02; + *(pdst0 + 2) = ctemp03; + *(pdst0 + 3) = ctemp04; + + psrc1 += 2 * lda; + psrc2 += 2 * lda; + pdst0 += 4; + + ctemp01 = *(psrc1 + 0); + ctemp02 = *(psrc1 + 1); + ctemp03 = *(psrc2 + 0); + ctemp04 = *(psrc2 + 1); + + *(pdst0 + 0) = ctemp01; + *(pdst0 + 1) = ctemp02; + *(pdst0 + 2) = ctemp03; + *(pdst0 + 3) = ctemp04; + + psrc1 += 2 * lda; + psrc2 += 2 * lda; + pdst0 += 4; + } + + if (m & 2) + { + ctemp01 = *(psrc1 + 0); + ctemp02 = *(psrc1 + 1); + ctemp03 = *(psrc2 + 0); + ctemp04 = *(psrc2 + 1); + + *(pdst0 + 0) = ctemp01; + *(pdst0 + 1) = ctemp02; + *(pdst0 + 2) = ctemp03; + *(pdst0 + 3) = ctemp04; + + psrc1 += 2 * lda; + psrc2 += 2 * lda; + pdst0 += 4; + } + + if (m & 1) + { + ctemp01 = *(psrc1 + 0); + ctemp02 = *(psrc1 + 1); + + *(pdst0 + 0) = ctemp01; + *(pdst0 + 1) = ctemp02; + pdst0 += 2; + } + } + + return 0; +} diff --git a/kernel/mips/dgemm_kernel_8x4_msa.c b/kernel/mips/dgemm_kernel_8x4_msa.c index 1f0a2aee6..9286e7469 100644 --- a/kernel/mips/dgemm_kernel_8x4_msa.c +++ b/kernel/mips/dgemm_kernel_8x4_msa.c @@ -35,19 +35,24 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, #endif ) { - BLASLONG i, j, l; - FLOAT *pc0, *pc1, *pc2, 
*pc3; - FLOAT *pa0, *pb0; + BLASLONG i, j, l, temp; +#if defined(TRMMKERNEL) + BLASLONG off; +#endif + FLOAT *pc0, *pc1, *pc2, *pc3, *pa0, *pb0; FLOAT tmp0, tmp1, tmp2, tmp3; - FLOAT a0; - FLOAT b0, b1, b2, b3; + FLOAT a0, b0, b1, b2, b3; v2f64 v_alpha = {alpha, alpha}; v2f64 src_a0, src_a1, src_a2, src_a3, src_b, src_b0, src_b1; v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; v2f64 res0, res1, res2, res3, res4, res5, res6, res7; v2f64 res8, res9, res10, res11, res12, res13, res14, res15; - for (j = (n / 4); j--;) +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#endif + + for (j = (n >> 2); j--;) { pc0 = C; pc1 = pc0 + ldc; @@ -56,12 +61,34 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 = A; - for (i = (m / 8); i--;) - { - pb0 = B; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif - LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3); - LD_DP2(pb0, 2, src_b0, src_b1); + for (i = (m >> 3); i--;) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 8; + pb0 = B + off * 4; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 8; // number of values in A +#else + temp = off + 4; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + + LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP2_INC(pb0, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 = src_a0 * src_b; @@ -87,13 +114,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res14 = src_a2 * src_b; res15 = src_a3 * src_b; - pa0 += 8; - pb0 += 4; - - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { - LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3); - LD_DP2(pb0, 2, src_b0, src_b1); + LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP2_INC(pb0, 2, 
src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; @@ -119,11 +143,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res14 += src_a2 * src_b; res15 += src_a3 * src_b; - pa0 += 8; - pb0 += 4; - - LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3); - LD_DP2(pb0, 2, src_b0, src_b1); + LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP2_INC(pb0, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; @@ -148,15 +169,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res13 += src_a1 * src_b; res14 += src_a2 * src_b; res15 += src_a3 * src_b; - - pa0 += 8; - pb0 += 4; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { - LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3); - LD_DP2(pb0, 2, src_b0, src_b1); + LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP2_INC(pb0, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; @@ -181,11 +199,18 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res13 += src_a1 * src_b; res14 += src_a2 * src_b; res15 += src_a3 * src_b; - - pa0 += 8; - pb0 += 4; } +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; + dst2 = res2 * v_alpha; + dst3 = res3 * v_alpha; + dst4 = res4 * v_alpha; + dst5 = res5 * v_alpha; + dst6 = res6 * v_alpha; + dst7 = res7 * v_alpha; +#else LD_DP4(pc0, 2, dst0, dst1, dst2, dst3); LD_DP4(pc1, 2, dst4, dst5, dst6, dst7); @@ -197,10 +222,20 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, dst5 += res5 * v_alpha; dst6 += res6 * v_alpha; dst7 += res7 * v_alpha; +#endif + ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); + ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2); - ST_DP4(dst0, dst1, dst2, dst3, pc0, 2); - ST_DP4(dst4, dst5, dst6, dst7, pc1, 2); - +#if defined(TRMMKERNEL) + dst0 = res8 * v_alpha; + dst1 = res9 * v_alpha; 
+ dst2 = res10 * v_alpha; + dst3 = res11 * v_alpha; + dst4 = res12 * v_alpha; + dst5 = res13 * v_alpha; + dst6 = res14 * v_alpha; + dst7 = res15 * v_alpha; +#else LD_DP4(pc2, 2, dst0, dst1, dst2, dst3); LD_DP4(pc3, 2, dst4, dst5, dst6, dst7); @@ -212,22 +247,53 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, dst5 += res13 * v_alpha; dst6 += res14 * v_alpha; dst7 += res15 * v_alpha; +#endif - ST_DP4(dst0, dst1, dst2, dst3, pc2, 2); - ST_DP4(dst4, dst5, dst6, dst7, pc3, 2); + ST_DP4_INC(dst0, dst1, dst2, dst3, pc2, 2); + ST_DP4_INC(dst4, dst5, dst6, dst7, pc3, 2); - pc0 += 8; - pc1 += 8; - pc2 += 8; - pc3 += 8; +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 8; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 8; + pb0 += temp * 4; +#endif + +#ifdef LEFT + off += 8; // number of values in A +#endif +#endif } - for (i = ((m & 4) / 4); i--;) + if (m & 4) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 4; + pb0 = B + off * 4; +#endif - LD_DP2(pa0, 2, src_a0, src_a1); - LD_DP2(pb0, 2, src_b0, src_b1); +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 4; // number of values in A +#else + temp = off + 4; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + + LD_DP2_INC(pa0, 2, src_a0, src_a1); + LD_DP2_INC(pb0, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 = src_a0 * src_b; @@ -245,13 +311,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res6 = src_a0 * src_b; res7 = src_a1 * src_b; - pa0 += 4; - pb0 += 4; - - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { - LD_DP2(pa0, 2, src_a0, src_a1); - LD_DP2(pb0, 2, src_b0, 
src_b1); + LD_DP2_INC(pa0, 2, src_a0, src_a1); + LD_DP2_INC(pb0, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; @@ -269,11 +332,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res6 += src_a0 * src_b; res7 += src_a1 * src_b; - pa0 += 4; - pb0 += 4; - - LD_DP2(pa0, 2, src_a0, src_a1); - LD_DP2(pb0, 2, src_b0, src_b1); + LD_DP2_INC(pa0, 2, src_a0, src_a1); + LD_DP2_INC(pb0, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; @@ -290,15 +350,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); res6 += src_a0 * src_b; res7 += src_a1 * src_b; - - pa0 += 4; - pb0 += 4; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { - LD_DP2(pa0, 2, src_a0, src_a1); - LD_DP2(pb0, 2, src_b0, src_b1); + LD_DP2_INC(pa0, 2, src_a0, src_a1); + LD_DP2_INC(pb0, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; @@ -315,11 +372,18 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); res6 += src_a0 * src_b; res7 += src_a1 * src_b; - - pa0 += 4; - pb0 += 4; } +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; + dst2 = res2 * v_alpha; + dst3 = res3 * v_alpha; + dst4 = res4 * v_alpha; + dst5 = res5 * v_alpha; + dst6 = res6 * v_alpha; + dst7 = res7 * v_alpha; +#else LD_DP2(pc0, 2, dst0, dst1); LD_DP2(pc1, 2, dst2, dst3); LD_DP2(pc2, 2, dst4, dst5); @@ -333,24 +397,55 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, dst5 += res5 * v_alpha; dst6 += res6 * v_alpha; dst7 += res7 * v_alpha; +#endif + ST_DP2_INC(dst0, dst1, pc0, 2); + ST_DP2_INC(dst2, dst3, pc1, 2); + ST_DP2_INC(dst4, dst5, pc2, 2); + ST_DP2_INC(dst6, dst7, pc3, 2); - ST_DP2(dst0, dst1, pc0, 2); - 
ST_DP2(dst2, dst3, pc1, 2); - ST_DP2(dst4, dst5, pc2, 2); - ST_DP2(dst6, dst7, pc3, 2); +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 4; + pb0 += temp * 4; +#endif - pc0 += 4; - pc1 += 4; - pc2 += 4; - pc3 += 4; +#ifdef LEFT + off += 4; // number of values in A +#endif +#endif } - for (i = ((m & 2) / 2); i--;) + if (m & 2) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 2; + pb0 = B + off * 4; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 2; // number of values in A +#else + temp = off + 4; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif src_a0 = LD_DP(pa0); - LD_DP2(pb0, 2, src_b0, src_b1); + pa0 += 2; + LD_DP2_INC(pb0, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 = src_a0 * src_b; @@ -364,13 +459,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); res3 = src_a0 * src_b; - pa0 += 2; - pb0 += 4; - - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { src_a0 = LD_DP(pa0); - LD_DP2(pb0, 2, src_b0, src_b1); + pa0 += 2; + LD_DP2_INC(pb0, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; @@ -384,11 +477,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); res3 += src_a0 * src_b; - pa0 += 2; - pb0 += 4; - src_a0 = LD_DP(pa0); - LD_DP2(pb0, 2, src_b0, src_b1); + pa0 += 2; + LD_DP2_INC(pb0, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 
+= src_a0 * src_b; @@ -401,15 +492,13 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); res3 += src_a0 * src_b; - - pa0 += 2; - pb0 += 4; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { src_a0 = LD_DP(pa0); - LD_DP2(pb0, 2, src_b0, src_b1); + pa0 += 2; + LD_DP2_INC(pb0, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; @@ -422,11 +511,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); res3 += src_a0 * src_b; - - pa0 += 2; - pb0 += 4; } +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; + dst2 = res2 * v_alpha; + dst3 = res3 * v_alpha; +#else dst0 = LD_DP(pc0); dst1 = LD_DP(pc1); dst2 = LD_DP(pc2); @@ -436,21 +528,55 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, dst1 += res1 * v_alpha; dst2 += res2 * v_alpha; dst3 += res3 * v_alpha; - +#endif ST_DP(dst0, pc0); ST_DP(dst1, pc1); ST_DP(dst2, pc2); ST_DP(dst3, pc3); +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 2; + pb0 += temp * 4; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif +#endif pc0 += 2; pc1 += 2; pc2 += 2; pc3 += 2; } - for (i = (m & 1); i--;) + if (m & 1) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 1; + pb0 = B + off * 4; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 1; // number of values in A +#else + temp = off + 4; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif a0 = pa0[0]; b0 = 
pb0[0]; @@ -468,7 +594,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 += 1; pb0 += 4; - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { a0 = pa0[0]; b0 = pb0[0]; @@ -503,7 +629,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pb0 += 4; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { a0 = pa0[0]; b0 = pb0[0]; @@ -527,10 +653,34 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, tmp2 = alpha * tmp2; tmp3 = alpha * tmp3; +#if defined(TRMMKERNEL) + pc0[0] = tmp0; + pc1[0] = tmp1; + pc2[0] = tmp2; + pc3[0] = tmp3; +#else pc0[0] += tmp0; pc1[0] += tmp1; pc2[0] += tmp2; pc3[0] += tmp3; +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 1; + pb0 += temp * 4; +#endif + +#ifdef LEFT + off += 1; // number of values in A +#endif +#endif pc0 += 1; pc1 += 1; @@ -538,25 +688,53 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pc3 += 1; } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 4; // number of values in A +#endif + l = (k << 2); B = B + l; i = (ldc << 2); C = C + i; } - for (j = ((n & 2) / 2); j--;) + if (n & 2) { pc0 = C; pc1 = pc0 + ldc; pa0 = A; - for (i = (m / 8); i--;) - { - pb0 = B; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif - LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3); + for (i = (m >> 3); i--;) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 8; + pb0 = B + off * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 8; // number of values in A +#else + temp = off + 2; // number of values in B +#endif 
+#else + pb0 = B; + temp = k; +#endif + + + LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); src_b0 = LD_DP(pb0); + pb0 += 2; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 = src_a0 * src_b; @@ -570,13 +748,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res6 = src_a2 * src_b; res7 = src_a3 * src_b; - pa0 += 8; - pb0 += 2; - - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { - LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); src_b0 = LD_DP(pb0); + pb0 += 2; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; @@ -590,11 +766,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res6 += src_a2 * src_b; res7 += src_a3 * src_b; - pa0 += 8; - pb0 += 2; - - LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); src_b0 = LD_DP(pb0); + pb0 += 2; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; @@ -607,15 +781,13 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res5 += src_a1 * src_b; res6 += src_a2 * src_b; res7 += src_a3 * src_b; - - pa0 += 8; - pb0 += 2; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { - LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); src_b0 = LD_DP(pb0); + pb0 += 2; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; @@ -628,11 +800,18 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res5 += src_a1 * src_b; res6 += src_a2 * src_b; res7 += src_a3 * src_b; - - pa0 += 8; - pb0 += 2; } +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; + dst2 = res2 * v_alpha; + dst3 = res3 * v_alpha; + dst4 = res4 * v_alpha; + dst5 = res5 * v_alpha; + dst6 = res6 * v_alpha; + dst7 = res7 * v_alpha; +#else LD_DP4(pc0, 2, dst0, 
dst1, dst2, dst3); LD_DP4(pc1, 2, dst4, dst5, dst6, dst7); @@ -644,20 +823,53 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, dst5 += res5 * v_alpha; dst6 += res6 * v_alpha; dst7 += res7 * v_alpha; +#endif + ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); + ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2); - ST_DP4(dst0, dst1, dst2, dst3, pc0, 2); - ST_DP4(dst4, dst5, dst6, dst7, pc1, 2); +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 8; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 8; + pb0 += temp * 2; +#endif - pc0 += 8; - pc1 += 8; +#ifdef LEFT + off += 8; // number of values in A +#endif +#endif } - for (i = ((m & 4) / 4); i--;) + if (m & 4) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 4; + pb0 = B + off * 2; +#endif - LD_DP2(pa0, 2, src_a0, src_a1); +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 4; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + + LD_DP2_INC(pa0, 2, src_a0, src_a1); src_b0 = LD_DP(pb0); + pb0 += 2; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 = src_a0 * src_b; @@ -667,13 +879,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res2 = src_a0 * src_b; res3 = src_a1 * src_b; - pa0 += 4; - pb0 += 2; - - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { - LD_DP2(pa0, 2, src_a0, src_a1); + LD_DP2_INC(pa0, 2, src_a0, src_a1); src_b0 = LD_DP(pb0); + pb0 += 2; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; @@ -683,11 +893,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res2 += src_a0 * 
src_b; res3 += src_a1 * src_b; - pa0 += 4; - pb0 += 2; - - LD_DP2(pa0, 2, src_a0, src_a1); + LD_DP2_INC(pa0, 2, src_a0, src_a1); src_b0 = LD_DP(pb0); + pb0 += 2; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; @@ -696,15 +904,13 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); res2 += src_a0 * src_b; res3 += src_a1 * src_b; - - pa0 += 4; - pb0 += 2; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { - LD_DP2(pa0, 2, src_a0, src_a1); + LD_DP2_INC(pa0, 2, src_a0, src_a1); src_b0 = LD_DP(pb0); + pb0 += 2; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; @@ -713,11 +919,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); res2 += src_a0 * src_b; res3 += src_a1 * src_b; - - pa0 += 4; - pb0 += 2; } +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; + dst2 = res2 * v_alpha; + dst3 = res3 * v_alpha; +#else LD_DP2(pc0, 2, dst0, dst1); LD_DP2(pc1, 2, dst2, dst3); @@ -725,20 +934,54 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, dst1 += res1 * v_alpha; dst2 += res2 * v_alpha; dst3 += res3 * v_alpha; +#endif + ST_DP2_INC(dst0, dst1, pc0, 2); + ST_DP2_INC(dst2, dst3, pc1, 2); - ST_DP2(dst0, dst1, pc0, 2); - ST_DP2(dst2, dst3, pc1, 2); +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 4; + pb0 += temp * 2; +#endif - pc0 += 4; - pc1 += 4; +#ifdef LEFT + off += 4; // number of values in A +#endif +#endif } - for (i = ((m & 2) / 2); i--;) + if (m & 2) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += 
off * 2; + pb0 = B + off * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 2; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif src_a0 = LD_DP(pa0); + pa0 += 2; src_b0 = LD_DP(pb0); + pb0 += 2; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 = src_a0 * src_b; @@ -746,13 +989,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); res1 = src_a0 * src_b; - pa0 += 2; - pb0 += 2; - - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { src_a0 = LD_DP(pa0); + pa0 += 2; src_b0 = LD_DP(pb0); + pb0 += 2; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; @@ -760,53 +1002,86 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); res1 += src_a0 * src_b; - pa0 += 2; - pb0 += 2; - src_a0 = LD_DP(pa0); + pa0 += 2; src_b0 = LD_DP(pb0); + pb0 += 2; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); res1 += src_a0 * src_b; - - pa0 += 2; - pb0 += 2; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { src_a0 = LD_DP(pa0); + pa0 += 2; src_b0 = LD_DP(pb0); + pb0 += 2; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); res1 += src_a0 * src_b; - - pa0 += 2; - pb0 += 2; } +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; +#else dst0 = LD_DP(pc0); dst1 = LD_DP(pc1); dst0 += res0 * v_alpha; dst1 += res1 * v_alpha; - +#endif ST_DP(dst0, pc0); ST_DP(dst1, pc1); +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = 
k - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 2; + pb0 += temp * 2; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif +#endif pc0 += 2; pc1 += 2; } - for (i = (m & 1); i--;) + if (m & 1) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 1; + pb0 = B + off * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 1; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif a0 = pa0[0]; b0 = pb0[0]; @@ -818,7 +1093,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 += 1; pb0 += 2; - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { a0 = pa0[0]; b0 = pb0[0]; @@ -841,7 +1116,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pb0 += 2; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { a0 = pa0[0]; b0 = pb0[0]; @@ -857,29 +1132,77 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, tmp0 = alpha * tmp0; tmp1 = alpha * tmp1; +#if defined(TRMMKERNEL) + pc0[0] = tmp0; + pc1[0] = tmp1; +#else pc0[0] += tmp0; pc1[0] += tmp1; +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 1; + pb0 += temp * 2; +#endif + +#ifdef LEFT + off += 1; // number of values in A +#endif +#endif pc0 += 1; pc1 += 1; } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; // number of values in A +#endif + l = (k << 1); B = B + l; i = (ldc << 1); C = C + i; } - for (j = (n & 1); j--;) + if (n & 1) { pc0 = C; pa0 = A; - for (i = (m / 8); i--;) - { - pb0 = B; 
+#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif - LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3); + for (i = (m >> 3); i--;) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 8; + pb0 = B + off * 1; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 8; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + + LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); src_b[0] = pb0[0]; src_b[1] = pb0[0]; @@ -888,12 +1211,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res2 = src_a2 * src_b; res3 = src_a3 * src_b; - pa0 += 8; pb0 += 1; - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { - LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); src_b[0] = pb0[0]; src_b[1] = pb0[0]; @@ -902,10 +1224,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res2 += src_a2 * src_b; res3 += src_a3 * src_b; - pa0 += 8; pb0 += 1; - LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); src_b[0] = pb0[0]; src_b[1] = pb0[0]; @@ -914,13 +1235,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res2 += src_a2 * src_b; res3 += src_a3 * src_b; - pa0 += 8; pb0 += 1; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { - LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); src_b[0] = pb0[0]; src_b[1] = pb0[0]; @@ -929,85 +1249,156 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res2 += src_a2 * src_b; res3 += src_a3 * src_b; - pa0 += 8; pb0 += 1; } +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; + dst2 = res2 * v_alpha; 
+ dst3 = res3 * v_alpha; +#else LD_DP4(pc0, 2, dst0, dst1, dst2, dst3); dst0 += res0 * v_alpha; dst1 += res1 * v_alpha; dst2 += res2 * v_alpha; dst3 += res3 * v_alpha; +#endif + ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); - ST_DP4(dst0, dst1, dst2, dst3, pc0, 2); +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 8; // number of values in A +#else + temp -= 1; // number of values in B +#endif + pa0 += temp * 8; + pb0 += temp * 1; +#endif - pc0 += 8; +#ifdef LEFT + off += 8; // number of values in A +#endif +#endif } - for (i = ((m & 4) / 4); i--;) + if (m & 4) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 4; + pb0 = B + off * 1; +#endif - LD_DP2(pa0, 2, src_a0, src_a1); +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 4; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + + LD_DP2_INC(pa0, 2, src_a0, src_a1); src_b[0] = pb0[0]; src_b[1] = pb0[0]; res0 = src_a0 * src_b; res1 = src_a1 * src_b; - pa0 += 4; pb0 += 1; - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { - LD_DP2(pa0, 2, src_a0, src_a1); + LD_DP2_INC(pa0, 2, src_a0, src_a1); src_b[0] = pb0[0]; src_b[1] = pb0[0]; res0 += src_a0 * src_b; res1 += src_a1 * src_b; - pa0 += 4; pb0 += 1; - LD_DP2(pa0, 2, src_a0, src_a1); + LD_DP2_INC(pa0, 2, src_a0, src_a1); src_b[0] = pb0[0]; src_b[1] = pb0[0]; res0 += src_a0 * src_b; res1 += src_a1 * src_b; - pa0 += 4; pb0 += 1; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { - LD_DP2(pa0, 2, src_a0, src_a1); + LD_DP2_INC(pa0, 2, src_a0, src_a1); src_b[0] = pb0[0]; src_b[1] = pb0[0]; res0 += src_a0 * src_b; res1 += src_a1 * src_b; - pa0 += 4; pb0 += 1; } +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * 
v_alpha; +#else LD_DP2(pc0, 2, dst0, dst1); dst0 += res0 * v_alpha; dst1 += res1 * v_alpha; +#endif + ST_DP2_INC(dst0, dst1, pc0, 2); - ST_DP2(dst0, dst1, pc0, 2); +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 1; // number of values in B +#endif + pa0 += temp * 4; + pb0 += temp * 1; +#endif - pc0 += 4; +#ifdef LEFT + off += 4; // number of values in A +#endif +#endif } - for (i = ((m & 2) / 2); i--;) + if (m & 2) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 2; + pb0 = B + off * 1; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 2; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif src_a0 = LD_DP(pa0); src_b[0] = pb0[0]; @@ -1018,7 +1409,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 += 2; pb0 += 1; - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { src_a0 = LD_DP(pa0); src_b[0] = pb0[0]; @@ -1039,7 +1430,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pb0 += 1; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { src_a0 = LD_DP(pa0); src_b[0] = pb0[0]; @@ -1051,18 +1442,55 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pb0 += 1; } +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; +#else dst0 = LD_DP(pc0); dst0 += res0 * v_alpha; - +#endif ST_DP(dst0, pc0); +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 1; // number of values in B +#endif + pa0 += temp * 2; + pb0 += temp * 1; +#endif + +#ifdef LEFT + off += 
2; // number of values in A +#endif +#endif pc0 += 2; } - for (i = (m & 1); i--;) + if (m & 1) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 1; + pb0 = B + off * 1; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 1; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif a0 = pa0[0]; b0 = pb0[0]; @@ -1071,7 +1499,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 += 1; pb0 += 1; - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { a0 = pa0[0]; b0 = pb0[0]; @@ -1088,7 +1516,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pb0 += 1; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { a0 = pa0[0]; b0 = pb0[0]; @@ -1098,15 +1526,41 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pb0 += 1; } +#if defined(TRMMKERNEL) + pc0[0] = alpha * tmp0; +#else pc0[0] += alpha * tmp0; +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 1; // number of values in B +#endif + pa0 += temp * 1; + pb0 += temp * 1; +#endif + +#ifdef LEFT + off += 1; // number of values in A +#endif +#endif pc0 += 1; } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; // number of values in A +#endif + l = (k << 0); B = B + l; i = (ldc << 0); C = C + i; } + return 0; } diff --git a/kernel/mips/dgemm_ncopy_4_msa.c b/kernel/mips/dgemm_ncopy_4_msa.c index bbd76070f..a61b2e806 100644 --- a/kernel/mips/dgemm_ncopy_4_msa.c +++ b/kernel/mips/dgemm_ncopy_4_msa.c @@ -32,8 +32,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, FLOAT * __restrict dst) { BLASLONG i, j; - FLOAT 
*psrc0, *psrc1, *psrc2, *psrc3, *psrc4; - FLOAT *pdst; + FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *pdst; v2f64 src0, src1, src2, src3, src4, src5, src6, src7; v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; @@ -50,28 +49,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, for (i = (m >> 2); i--;) { - LD_DP2(psrc1, 2, src0, src1); - LD_DP2(psrc2, 2, src2, src3); - LD_DP2(psrc3, 2, src4, src5); - LD_DP2(psrc4, 2, src6, src7); + LD_DP2_INC(psrc1, 2, src0, src1); + LD_DP2_INC(psrc2, 2, src2, src3); + LD_DP2_INC(psrc3, 2, src4, src5); + LD_DP2_INC(psrc4, 2, src6, src7); - psrc1 += 4; - psrc2 += 4; - psrc3 += 4; - psrc4 += 4; + ILVRL_D2_DP(src2, src0, dst0, dst4); + ILVRL_D2_DP(src6, src4, dst1, dst5); + ILVRL_D2_DP(src3, src1, dst2, dst6); + ILVRL_D2_DP(src7, src5, dst3, dst7); - dst0 = (v2f64) __msa_ilvr_d((v2i64) src2, (v2i64) src0); - dst1 = (v2f64) __msa_ilvr_d((v2i64) src6, (v2i64) src4); - dst2 = (v2f64) __msa_ilvr_d((v2i64) src3, (v2i64) src1); - dst3 = (v2f64) __msa_ilvr_d((v2i64) src7, (v2i64) src5); - - dst4 = (v2f64) __msa_ilvl_d((v2i64) src2, (v2i64) src0); - dst5 = (v2f64) __msa_ilvl_d((v2i64) src6, (v2i64) src4); - dst6 = (v2f64) __msa_ilvl_d((v2i64) src3, (v2i64) src1); - dst7 = (v2f64) __msa_ilvl_d((v2i64) src7, (v2i64) src5); - - ST_DP8(dst0, dst1, dst4, dst5, dst2, dst3, dst6, dst7, pdst, 2); - pdst += 16; + ST_DP8_INC(dst0, dst1, dst4, dst5, dst2, dst3, dst6, dst7, pdst, 2); } for (i = (m & 3); i--;) @@ -91,18 +79,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, for (i = (m >> 2); i--;) { - LD_DP2(psrc1, 2, src0, src1); - LD_DP2(psrc2, 2, src2, src3); - psrc1 += 4; - psrc2 += 4; + LD_DP2_INC(psrc1, 2, src0, src1); + LD_DP2_INC(psrc2, 2, src2, src3); - dst0 = (v2f64) __msa_ilvr_d((v2i64) src2, (v2i64) src0); - dst1 = (v2f64) __msa_ilvr_d((v2i64) src3, (v2i64) src1); - dst4 = (v2f64) __msa_ilvl_d((v2i64) src2, (v2i64) src0); - dst5 = (v2f64) __msa_ilvl_d((v2i64) src3, (v2i64) src1); + 
ILVRL_D2_DP(src2, src0, dst0, dst4); + ILVRL_D2_DP(src3, src1, dst1, dst5); - ST_DP4(dst0, dst4, dst1, dst5, pdst, 2); - pdst += 8; + ST_DP4_INC(dst0, dst4, dst1, dst5, pdst, 2); } for (i = (m & 3); i--;) diff --git a/kernel/mips/dgemm_ncopy_8_msa.c b/kernel/mips/dgemm_ncopy_8_msa.c index 43c977582..86d019c4f 100644 --- a/kernel/mips/dgemm_ncopy_8_msa.c +++ b/kernel/mips/dgemm_ncopy_8_msa.c @@ -32,9 +32,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, FLOAT * __restrict dst) { BLASLONG i, j; - FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4; - FLOAT *psrc5, *psrc6, *psrc7, *psrc8; - FLOAT *pdst; + FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *psrc5, *psrc6, *psrc7; + FLOAT *psrc8, *pdst; v2f64 src0, src1, src2, src3, src4, src5, src6, src7; v2f64 src8, src9, src10, src11, src12, src13, src14, src15; v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; @@ -56,80 +55,51 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, for (i = (m >> 3); i--;) { - LD_DP2(psrc1, 2, src0, src1); - LD_DP2(psrc2, 2, src2, src3); - LD_DP2(psrc3, 2, src4, src5); - LD_DP2(psrc4, 2, src6, src7); - LD_DP2(psrc5, 2, src8, src9); - LD_DP2(psrc6, 2, src10, src11); - LD_DP2(psrc7, 2, src12, src13); - LD_DP2(psrc8, 2, src14, src15); + LD_DP2_INC(psrc1, 2, src0, src1); + LD_DP2_INC(psrc2, 2, src2, src3); + LD_DP2_INC(psrc3, 2, src4, src5); + LD_DP2_INC(psrc4, 2, src6, src7); + LD_DP2_INC(psrc5, 2, src8, src9); + LD_DP2_INC(psrc6, 2, src10, src11); + LD_DP2_INC(psrc7, 2, src12, src13); + LD_DP2_INC(psrc8, 2, src14, src15); - dst0 = (v2f64) __msa_ilvr_d((v2i64) src2, (v2i64) src0); - dst1 = (v2f64) __msa_ilvr_d((v2i64) src6, (v2i64) src4); - dst2 = (v2f64) __msa_ilvr_d((v2i64) src10, (v2i64) src8); - dst3 = (v2f64) __msa_ilvr_d((v2i64) src14, (v2i64) src12); - dst4 = (v2f64) __msa_ilvl_d((v2i64) src2, (v2i64) src0); - dst5 = (v2f64) __msa_ilvl_d((v2i64) src6, (v2i64) src4); - dst6 = (v2f64) __msa_ilvl_d((v2i64) src10, (v2i64) src8); - dst7 = (v2f64) 
__msa_ilvl_d((v2i64) src14, (v2i64) src12); + ILVRL_D2_DP(src2, src0, dst0, dst4); + ILVRL_D2_DP(src6, src4, dst1, dst5); + ILVRL_D2_DP(src10, src8, dst2, dst6); + ILVRL_D2_DP(src14, src12, dst3, dst7); - ST_DP8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2); + ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2); - dst0 = (v2f64) __msa_ilvr_d((v2i64) src3, (v2i64) src1); - dst1 = (v2f64) __msa_ilvr_d((v2i64) src7, (v2i64) src5); - dst2 = (v2f64) __msa_ilvr_d((v2i64) src11, (v2i64) src9); - dst3 = (v2f64) __msa_ilvr_d((v2i64) src15, (v2i64) src13); - dst4 = (v2f64) __msa_ilvl_d((v2i64) src3, (v2i64) src1); - dst5 = (v2f64) __msa_ilvl_d((v2i64) src7, (v2i64) src5); - dst6 = (v2f64) __msa_ilvl_d((v2i64) src11, (v2i64) src9); - dst7 = (v2f64) __msa_ilvl_d((v2i64) src15, (v2i64) src13); + ILVRL_D2_DP(src3, src1, dst0, dst4); + ILVRL_D2_DP(src7, src5, dst1, dst5); + ILVRL_D2_DP(src11, src9, dst2, dst6); + ILVRL_D2_DP(src15, src13, dst3, dst7); - ST_DP8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst + 16, - 2); + ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2); - LD_DP2(psrc1 + 4, 2, src0, src1); - LD_DP2(psrc2 + 4, 2, src2, src3); - LD_DP2(psrc3 + 4, 2, src4, src5); - LD_DP2(psrc4 + 4, 2, src6, src7); - LD_DP2(psrc5 + 4, 2, src8, src9); - LD_DP2(psrc6 + 4, 2, src10, src11); - LD_DP2(psrc7 + 4, 2, src12, src13); - LD_DP2(psrc8 + 4, 2, src14, src15); + LD_DP2_INC(psrc1, 2, src0, src1); + LD_DP2_INC(psrc2, 2, src2, src3); + LD_DP2_INC(psrc3, 2, src4, src5); + LD_DP2_INC(psrc4, 2, src6, src7); + LD_DP2_INC(psrc5, 2, src8, src9); + LD_DP2_INC(psrc6, 2, src10, src11); + LD_DP2_INC(psrc7, 2, src12, src13); + LD_DP2_INC(psrc8, 2, src14, src15); - dst0 = (v2f64) __msa_ilvr_d((v2i64) src2, (v2i64) src0); - dst1 = (v2f64) __msa_ilvr_d((v2i64) src6, (v2i64) src4); - dst2 = (v2f64) __msa_ilvr_d((v2i64) src10, (v2i64) src8); - dst3 = (v2f64) __msa_ilvr_d((v2i64) src14, (v2i64) src12); - dst4 = (v2f64) __msa_ilvl_d((v2i64) src2, 
(v2i64) src0); - dst5 = (v2f64) __msa_ilvl_d((v2i64) src6, (v2i64) src4); - dst6 = (v2f64) __msa_ilvl_d((v2i64) src10, (v2i64) src8); - dst7 = (v2f64) __msa_ilvl_d((v2i64) src14, (v2i64) src12); + ILVRL_D2_DP(src2, src0, dst0, dst4); + ILVRL_D2_DP(src6, src4, dst1, dst5); + ILVRL_D2_DP(src10, src8, dst2, dst6); + ILVRL_D2_DP(src14, src12, dst3, dst7); - ST_DP8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst + 32, - 2); + ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2); - dst0 = (v2f64) __msa_ilvr_d((v2i64) src3, (v2i64) src1); - dst1 = (v2f64) __msa_ilvr_d((v2i64) src7, (v2i64) src5); - dst2 = (v2f64) __msa_ilvr_d((v2i64) src11, (v2i64) src9); - dst3 = (v2f64) __msa_ilvr_d((v2i64) src15, (v2i64) src13); - dst4 = (v2f64) __msa_ilvl_d((v2i64) src3, (v2i64) src1); - dst5 = (v2f64) __msa_ilvl_d((v2i64) src7, (v2i64) src5); - dst6 = (v2f64) __msa_ilvl_d((v2i64) src11, (v2i64) src9); - dst7 = (v2f64) __msa_ilvl_d((v2i64) src15, (v2i64) src13); + ILVRL_D2_DP(src3, src1, dst0, dst4); + ILVRL_D2_DP(src7, src5, dst1, dst5); + ILVRL_D2_DP(src11, src9, dst2, dst6); + ILVRL_D2_DP(src15, src13, dst3, dst7); - ST_DP8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst + 48, - 2); - - psrc1 += 8; - psrc2 += 8; - psrc3 += 8; - psrc4 += 8; - psrc5 += 8; - psrc6 += 8; - psrc7 += 8; - psrc8 += 8; - pdst += 64; + ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2); } for (i = (m & 7); i--;) @@ -155,27 +125,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, for (i = (m >> 2); i--;) { - LD_DP2(psrc1, 2, src0, src1); - LD_DP2(psrc2, 2, src2, src3); - LD_DP2(psrc3, 2, src4, src5); - LD_DP2(psrc4, 2, src6, src7); - psrc1 += 4; - psrc2 += 4; - psrc3 += 4; - psrc4 += 4; + LD_DP2_INC(psrc1, 2, src0, src1); + LD_DP2_INC(psrc2, 2, src2, src3); + LD_DP2_INC(psrc3, 2, src4, src5); + LD_DP2_INC(psrc4, 2, src6, src7); - dst0 = (v2f64) __msa_ilvr_d((v2i64) src2, (v2i64) src0); - dst1 = (v2f64) __msa_ilvr_d((v2i64) src6, (v2i64) 
src4); - dst2 = (v2f64) __msa_ilvr_d((v2i64) src3, (v2i64) src1); - dst3 = (v2f64) __msa_ilvr_d((v2i64) src7, (v2i64) src5); + ILVRL_D2_DP(src2, src0, dst0, dst4); + ILVRL_D2_DP(src6, src4, dst1, dst5); + ILVRL_D2_DP(src3, src1, dst2, dst6); + ILVRL_D2_DP(src7, src5, dst3, dst7); - dst4 = (v2f64) __msa_ilvl_d((v2i64) src2, (v2i64) src0); - dst5 = (v2f64) __msa_ilvl_d((v2i64) src6, (v2i64) src4); - dst6 = (v2f64) __msa_ilvl_d((v2i64) src3, (v2i64) src1); - dst7 = (v2f64) __msa_ilvl_d((v2i64) src7, (v2i64) src5); - - ST_DP8(dst0, dst1, dst4, dst5, dst2, dst3, dst6, dst7, pdst, 2); - pdst += 16; + ST_DP8_INC(dst0, dst1, dst4, dst5, dst2, dst3, dst6, dst7, pdst, 2); } for (i = (m & 3); i--;) @@ -200,11 +160,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, psrc1 += 2; psrc2 += 2; - dst0 = (v2f64) __msa_ilvr_d((v2i64) src1, (v2i64) src0); - dst1 = (v2f64) __msa_ilvl_d((v2i64) src1, (v2i64) src0); + ILVRL_D2_DP(src1, src0, dst0, dst1); - ST_DP2(dst0, dst1, pdst, 2); - pdst += 4; + ST_DP2_INC(dst0, dst1, pdst, 2); } if (m & 1) diff --git a/kernel/mips/dgemm_tcopy_4_msa.c b/kernel/mips/dgemm_tcopy_4_msa.c index f147d190e..a51c47429 100644 --- a/kernel/mips/dgemm_tcopy_4_msa.c +++ b/kernel/mips/dgemm_tcopy_4_msa.c @@ -55,14 +55,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, for (i = (n >> 2); i--;) { - LD_DP2(psrc1, 2, src0, src1); - LD_DP2(psrc2, 2, src2, src3); - LD_DP2(psrc3, 2, src4, src5); - LD_DP2(psrc4, 2, src6, src7); - psrc1 += 4; - psrc2 += 4; - psrc3 += 4; - psrc4 += 4; + LD_DP2_INC(psrc1, 2, src0, src1); + LD_DP2_INC(psrc2, 2, src2, src3); + LD_DP2_INC(psrc3, 2, src4, src5); + LD_DP2_INC(psrc4, 2, src6, src7); ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2); pdst1 += m * 4; @@ -79,8 +75,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, psrc3 += 2; psrc4 += 2; - ST_DP4(src0, src1, src2, src3, pdst2, 2); - pdst2 += 8; + ST_DP4_INC(src0, src1, src2, src3, pdst2, 
2); } if (n & 1) @@ -103,10 +98,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, for (i = (n >> 2); i--;) { - LD_DP2(psrc1, 2, src0, src1); - LD_DP2(psrc2, 2, src2, src3); - psrc1 += 4; - psrc2 += 4; + LD_DP2_INC(psrc1, 2, src0, src1); + LD_DP2_INC(psrc2, 2, src2, src3); ST_DP4(src0, src1, src2, src3, pdst1, 2); pdst1 += m * 4; @@ -119,8 +112,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, psrc1 += 2; psrc2 += 2; - ST_DP2(src0, src1, pdst2, 2); - pdst2 += 4; + ST_DP2_INC(src0, src1, pdst2, 2); } if (n & 1) @@ -137,8 +129,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, for (i = (n >> 2); i--;) { - LD_DP2(psrc1, 2, src0, src1); - psrc1 += 4; + LD_DP2_INC(psrc1, 2, src0, src1); ST_DP2(src0, src1, pdst1, 2); pdst1 += 4 * m; diff --git a/kernel/mips/dgemm_tcopy_8_msa.c b/kernel/mips/dgemm_tcopy_8_msa.c index d1ac49b5a..350ecb359 100644 --- a/kernel/mips/dgemm_tcopy_8_msa.c +++ b/kernel/mips/dgemm_tcopy_8_msa.c @@ -62,27 +62,19 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, for (i = (n >> 3); i--;) { - LD_DP4(psrc1, 2, src0, src1, src2, src3); - LD_DP4(psrc2, 2, src4, src5, src6, src7); - LD_DP4(psrc3, 2, src8, src9, src10, src11); - LD_DP4(psrc4, 2, src12, src13, src14, src15); - psrc1 += 8; - psrc2 += 8; - psrc3 += 8; - psrc4 += 8; + LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); + LD_DP4_INC(psrc2, 2, src4, src5, src6, src7); + LD_DP4_INC(psrc3, 2, src8, src9, src10, src11); + LD_DP4_INC(psrc4, 2, src12, src13, src14, src15); ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2); ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15, pdst1 + 16, 2); - LD_DP4(psrc5, 2, src0, src1, src2, src3); - LD_DP4(psrc6, 2, src4, src5, src6, src7); - LD_DP4(psrc7, 2, src8, src9, src10, src11); - LD_DP4(psrc8, 2, src12, src13, src14, src15); - psrc5 += 8; - psrc6 += 8; - psrc7 += 8; - psrc8 += 8; + LD_DP4_INC(psrc5, 2, src0, src1, src2, src3); + 
LD_DP4_INC(psrc6, 2, src4, src5, src6, src7); + LD_DP4_INC(psrc7, 2, src8, src9, src10, src11); + LD_DP4_INC(psrc8, 2, src12, src13, src14, src15); ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1 + 32, 2); @@ -93,27 +85,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, if (n & 4) { - LD_DP2(psrc1, 2, src0, src1); - LD_DP2(psrc2, 2, src2, src3); - LD_DP2(psrc3, 2, src4, src5); - LD_DP2(psrc4, 2, src6, src7); - LD_DP2(psrc5, 2, src8, src9); - LD_DP2(psrc6, 2, src10, src11); - LD_DP2(psrc7, 2, src12, src13); - LD_DP2(psrc8, 2, src14, src15); - psrc1 += 4; - psrc2 += 4; - psrc3 += 4; - psrc4 += 4; - psrc5 += 4; - psrc6 += 4; - psrc7 += 4; - psrc8 += 4; + LD_DP2_INC(psrc1, 2, src0, src1); + LD_DP2_INC(psrc2, 2, src2, src3); + LD_DP2_INC(psrc3, 2, src4, src5); + LD_DP2_INC(psrc4, 2, src6, src7); + LD_DP2_INC(psrc5, 2, src8, src9); + LD_DP2_INC(psrc6, 2, src10, src11); + LD_DP2_INC(psrc7, 2, src12, src13); + LD_DP2_INC(psrc8, 2, src14, src15); - ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 2); - ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15, - pdst2 + 16, 2); - pdst2 += 32; + ST_DP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 2); + ST_DP8_INC(src8, src9, src10, src11, src12, src13, src14, src15, + pdst2, 2); } if (n & 2) @@ -135,8 +118,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, psrc7 += 2; psrc8 += 2; - ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst3, 2); - pdst3 += 16; + ST_DP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst3, 2); } if (n & 1) @@ -165,18 +147,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, for (i = (n >> 3); i--;) { - LD_DP4(psrc1, 2, src0, src1, src2, src3); - LD_DP4(psrc2, 2, src4, src5, src6, src7); - LD_DP4(psrc3, 2, src8, src9, src10, src11); - LD_DP4(psrc4, 2, src12, src13, src14, src15); - psrc1 += 8; - psrc2 += 8; - psrc3 += 8; - psrc4 += 8; - psrc5 += 8; - psrc6 += 8; - psrc7 
+= 8; - psrc8 += 8; + LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); + LD_DP4_INC(psrc2, 2, src4, src5, src6, src7); + LD_DP4_INC(psrc3, 2, src8, src9, src10, src11); + LD_DP4_INC(psrc4, 2, src12, src13, src14, src15); ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2); ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15, @@ -186,17 +160,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, if (n & 4) { - LD_DP2(psrc1, 2, src0, src1); - LD_DP2(psrc2, 2, src2, src3); - LD_DP2(psrc3, 2, src4, src5); - LD_DP2(psrc4, 2, src6, src7); - psrc1 += 4; - psrc2 += 4; - psrc3 += 4; - psrc4 += 4; + LD_DP2_INC(psrc1, 2, src0, src1); + LD_DP2_INC(psrc2, 2, src2, src3); + LD_DP2_INC(psrc3, 2, src4, src5); + LD_DP2_INC(psrc4, 2, src6, src7); - ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 2); - pdst2 += 16; + ST_DP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 2); } if (n & 2) @@ -210,8 +179,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, psrc3 += 2; psrc4 += 2; - ST_DP4(src0, src1, src2, src3, pdst3, 2); - pdst3 += 8; + ST_DP4_INC(src0, src1, src2, src3, pdst3, 2); } if (n & 1) @@ -234,10 +202,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, for (i = (n >> 3); i--;) { - LD_DP4(psrc1, 2, src0, src1, src2, src3); - LD_DP4(psrc2, 2, src4, src5, src6, src7); - psrc1 += 8; - psrc2 += 8; + LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); + LD_DP4_INC(psrc2, 2, src4, src5, src6, src7); ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2); pdst1 += 8 * m; @@ -245,13 +211,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, if (n & 4) { - LD_DP2(psrc1, 2, src0, src1); - LD_DP2(psrc2, 2, src2, src3); - psrc1 += 4; - psrc2 += 4; + LD_DP2_INC(psrc1, 2, src0, src1); + LD_DP2_INC(psrc2, 2, src2, src3); - ST_DP4(src0, src1, src2, src3, pdst2, 2); - pdst2 += 8; + ST_DP4_INC(src0, src1, src2, src3, pdst2, 2); } if (n & 2) @@ -261,8 
+224,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, psrc1 += 2; psrc2 += 2; - ST_DP2(src0, src1, pdst3, 2); - pdst3 += 4; + ST_DP2_INC(src0, src1, pdst3, 2); } if (n & 1) @@ -282,8 +244,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, for (i = (n >> 3); i--;) { - LD_DP4(psrc1, 2, src0, src1, src2, src3); - psrc1 += 8; + LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); ST_DP4(src0, src1, src2, src3, pdst1, 2); pdst1 += 8 * m; @@ -291,11 +252,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, if (n & 4) { - LD_DP2(psrc1, 2, src0, src1); - psrc1 += 4; + LD_DP2_INC(psrc1, 2, src0, src1); - ST_DP2(src0, src1, pdst2, 2); - pdst2 += 4; + ST_DP2_INC(src0, src1, pdst2, 2); } if (n & 2) diff --git a/kernel/mips/macros_msa.h b/kernel/mips/macros_msa.h index 0efca7860..dbc185302 100644 --- a/kernel/mips/macros_msa.h +++ b/kernel/mips/macros_msa.h @@ -42,10 +42,82 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ST_D(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) #define ST_DP(...) 
ST_D(v2f64, __VA_ARGS__) -#define COPY_FLOAT_TO_VECTOR(a, b) \ - b = __msa_cast_to_vector_float(a); \ - b = (v4f32) __msa_splati_w((v4i32) b, 0); +#define COPY_FLOAT_TO_VECTOR(a) ( { \ + v4f32 out; \ + out = __msa_cast_to_vector_float(a); \ + out = (v4f32) __msa_splati_w((v4i32) out, 0); \ + out; \ +} ) +#define COPY_DOUBLE_TO_VECTOR(a) ( { \ + v2f64 out; \ + out = __msa_cast_to_vector_double(a); \ + out = (v2f64) __msa_splati_d((v2i64) out, 0); \ + out; \ +} ) + +/* Description : Load 2 variables with stride + Arguments : Inputs - psrc, stride + Outputs - out0, out1 +*/ +#define LD_GP2_INC(psrc, stride, out0, out1) \ +{ \ + out0 = *(psrc); \ + (psrc) += stride; \ + out1 = *(psrc); \ + (psrc) += stride; \ +} + +#define LD_GP3_INC(psrc, stride, out0, \ + out1, out2) \ +{ \ + LD_GP2_INC(psrc, stride, out0, out1); \ + out2 = *(psrc); \ + (psrc) += stride; \ +} + +#define LD_GP4_INC(psrc, stride, out0, \ + out1, out2, out3) \ +{ \ + LD_GP2_INC(psrc, stride, out0, out1); \ + LD_GP2_INC(psrc, stride, out2, out3); \ +} + +#define LD_GP5_INC(psrc, stride, out0, \ + out1, out2, out3, out4) \ +{ \ + LD_GP2_INC(psrc, stride, out0, out1); \ + LD_GP2_INC(psrc, stride, out2, out3); \ + out4 = *(psrc); \ + (psrc) += stride; \ +} + +#define LD_GP6_INC(psrc, stride, out0, \ + out1, out2, out3, \ + out4, out5) \ +{ \ + LD_GP2_INC(psrc, stride, out0, out1); \ + LD_GP2_INC(psrc, stride, out2, out3); \ + LD_GP2_INC(psrc, stride, out4, out5); \ +} + +#define LD_GP7_INC(psrc, stride, out0, \ + out1, out2, out3, \ + out4, out5, out6) \ +{ \ + LD_GP2_INC(psrc, stride, out0, out1); \ + LD_GP2_INC(psrc, stride, out2, out3); \ + LD_GP2_INC(psrc, stride, out4, out5); \ + out6 = *(psrc); \ + (psrc) += stride; \ +} + +#define LD_GP8_INC(psrc, stride, out0, out1, out2, \ + out3, out4, out5, out6, out7) \ +{ \ + LD_GP4_INC(psrc, stride, out0, out1, out2, out3); \ + LD_GP4_INC(psrc, stride, out4, out5, out6, out7); \ +} /* Description : Load 2 vectors of single precision floating point elements 
with stride Arguments : Inputs - psrc, stride @@ -58,6 +130,82 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. out1 = LD_SP((psrc) + stride); \ } +#define LD_SP4(psrc, stride, out0, out1, out2, out3) \ +{ \ + LD_SP2(psrc, stride, out0, out1) \ + LD_SP2(psrc + 2 * stride, stride, out2, out3) \ +} + +#define LD_SP2_INC(psrc, stride, out0, out1) \ +{ \ + out0 = LD_SP((psrc)); \ + (psrc) += stride; \ + out1 = LD_SP((psrc)); \ + (psrc) += stride; \ +} + +#define LD_SP3_INC(psrc, stride, out0, \ + out1, out2) \ +{ \ + LD_SP2_INC(psrc, stride, out0, out1); \ + out2 = LD_SP((psrc)); \ + (psrc) += stride; \ +} + +#define LD_SP4_INC(psrc, stride, out0, \ + out1, out2, out3) \ +{ \ + LD_SP2_INC(psrc, stride, out0, out1); \ + LD_SP2_INC(psrc, stride, out2, out3); \ +} + +#define LD_SP5_INC(psrc, stride, out0, \ + out1, out2, out3, out4) \ +{ \ + LD_SP2_INC(psrc, stride, out0, out1); \ + LD_SP2_INC(psrc, stride, out2, out3); \ + out4 = LD_SP((psrc)); \ + (psrc) += stride; \ +} + +#define LD_SP6_INC(psrc, stride, out0, \ + out1, out2, out3, \ + out4, out5) \ +{ \ + LD_SP2_INC(psrc, stride, out0, out1); \ + LD_SP2_INC(psrc, stride, out2, out3); \ + LD_SP2_INC(psrc, stride, out4, out5); \ +} + +#define LD_SP7_INC(psrc, stride, out0, \ + out1, out2, out3, \ + out4, out5, out6) \ +{ \ + LD_SP2_INC(psrc, stride, out0, out1); \ + LD_SP2_INC(psrc, stride, out2, out3); \ + LD_SP2_INC(psrc, stride, out4, out5); \ + out6 = LD_SP((psrc)); \ + (psrc) += stride; \ +} + +#define LD_SP8_INC(psrc, stride, out0, out1, out2, \ + out3, out4, out5, out6, out7) \ +{ \ + LD_SP4_INC(psrc, stride, out0, out1, out2, out3); \ + LD_SP4_INC(psrc, stride, out4, out5, out6, out7); \ +} + +#define LD_SP16_INC(psrc, stride, out0, out1, out2, \ + out3, out4, out5, out6, out7, out8, \ + out9, out10, out11, out12, out13, \ + out14, out15) \ +{ \ + LD_SP8_INC(psrc, stride, out0, out1, out2, \ + out3, out4, out5, out6, out7); \ + LD_SP8_INC(psrc, stride, out8, out9, out10, \ + out11, 
out12, out13, out14, out15); \ +} + /* Description : Load 2 vectors of double precision floating point elements with stride Arguments : Inputs - psrc, stride Outputs - out0, out1 @@ -75,6 +223,139 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. LD_DP2(psrc + 2 * stride, stride, out2, out3) \ } +#define LD_DP2_INC(psrc, stride, out0, out1) \ +{ \ + out0 = LD_DP(psrc); \ + (psrc) += stride; \ + out1 = LD_DP(psrc); \ + (psrc) += stride; \ +} + +#define LD_DP3_INC(psrc, stride, out0, \ + out1, out2) \ +{ \ + LD_DP2_INC(psrc, stride, out0, out1); \ + out2 = LD_DP((psrc)); \ + (psrc) += stride; \ +} + +#define LD_DP4_INC(psrc, stride, out0, \ + out1, out2, out3) \ +{ \ + LD_DP2_INC(psrc, stride, out0, out1); \ + LD_DP2_INC(psrc, stride, out2, out3); \ +} + +#define LD_DP5_INC(psrc, stride, out0, \ + out1, out2, out3, out4) \ +{ \ + LD_DP2_INC(psrc, stride, out0, out1); \ + LD_DP2_INC(psrc, stride, out2, out3); \ + out4 = LD_DP((psrc)); \ + (psrc) += stride; \ +} + +#define LD_DP6_INC(psrc, stride, out0, \ + out1, out2, out3, \ + out4, out5) \ +{ \ + LD_DP2_INC(psrc, stride, out0, out1); \ + LD_DP2_INC(psrc, stride, out2, out3); \ + LD_DP2_INC(psrc, stride, out4, out5); \ +} + +#define LD_DP7_INC(psrc, stride, out0, \ + out1, out2, out3, \ + out4, out5, out6) \ +{ \ + LD_DP2_INC(psrc, stride, out0, out1); \ + LD_DP2_INC(psrc, stride, out2, out3); \ + LD_DP2_INC(psrc, stride, out4, out5); \ + out6 = LD_DP((psrc)); \ + (psrc) += stride; \ +} + +#define LD_DP8_INC(psrc, stride, out0, out1, out2, \ + out3, out4, out5, out6, out7) \ +{ \ + LD_DP4_INC(psrc, stride, out0, out1, out2, out3); \ + LD_DP4_INC(psrc, stride, out4, out5, out6, out7); \ +} + +#define LD_DP16_INC(psrc, stride, out0, out1, out2, \ + out3, out4, out5, out6, out7, out8, \ + out9, out10, out11, out12, out13, \ + out14, out15) \ +{ \ + LD_DP8_INC(psrc, stride, out0, out1, out2, \ + out3, out4, out5, out6, out7); \ + LD_DP8_INC(psrc, stride, out8, out9, out10, \ + out11, out12, 
out13, out14, out15); \ +} + +/* Description : Store GP variable with stride + Arguments : Inputs - in0, in1, pdst, stride + Details : Store 4 single precision floating point elements from 'in0' to (pdst) + Store 4 single precision floating point elements from 'in1' to (pdst + stride) +*/ +#define ST_GP2_INC(in0, in1, \ + pdst, stride) \ +{ \ + *(pdst) = in0; \ + (pdst) += stride; \ + *(pdst) = in1; \ + (pdst) += stride; \ +} + +#define ST_GP3_INC(in0, in1, in2, \ + pdst, stride) \ +{ \ + ST_GP2_INC(in0, in1, pdst, stride); \ + *(pdst) = in2; \ + (pdst) += stride; \ +} + +#define ST_GP4_INC(in0, in1, in2, in3, \ + pdst, stride) \ +{ \ + ST_GP2_INC(in0, in1, pdst, stride); \ + ST_GP2_INC(in2, in3, pdst, stride); \ +} + +#define ST_GP5_INC(in0, in1, in2, in3, \ + in4, pdst, stride) \ +{ \ + ST_GP2_INC(in0, in1, pdst, stride); \ + ST_GP2_INC(in2, in3, pdst, stride); \ + *(pdst) = in4; \ + (pdst) += stride; \ +} + +#define ST_GP6_INC(in0, in1, in2, in3, \ + in4, in5, pdst, stride) \ +{ \ + ST_GP2_INC(in0, in1, pdst, stride); \ + ST_GP2_INC(in2, in3, pdst, stride); \ + ST_GP2_INC(in4, in5, pdst, stride); \ +} + +#define ST_GP7_INC(in0, in1, in2, in3, in4, \ + in5, in6, pdst, stride) \ +{ \ + ST_GP2_INC(in0, in1, pdst, stride); \ + ST_GP2_INC(in2, in3, pdst, stride); \ + ST_GP2_INC(in4, in5, pdst, stride); \ + *(pdst) = in6; \ + (pdst) += stride; \ +} + +#define ST_GP8_INC(in0, in1, in2, in3, in4, in5, \ + in6, in7, pdst, stride) \ +{ \ + ST_GP4_INC(in0, in1, in2, in3, pdst, stride); \ + ST_GP4_INC(in4, in5, in6, in7, pdst, stride); \ +} + /* Description : Store vectors of single precision floating point elements with stride Arguments : Inputs - in0, in1, pdst, stride Details : Store 4 single precision floating point elements from 'in0' to (pdst) @@ -98,6 +379,73 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ST_SP4(in4, in5, in6, in7, (pdst + 4 * stride), stride); \ } +#define ST_SP2_INC(in0, in1, pdst, stride) \ +{ \ + ST_SP(in0, (pdst)); \ + (pdst) += stride; \ + ST_SP(in1, (pdst)); \ + (pdst) += stride; \ +} + +#define ST_SP3_INC(in0, in1, in2, \ + pdst, stride) \ +{ \ + ST_SP2_INC(in0, in1, pdst, stride); \ + ST_SP(in2, (pdst)); \ + (pdst) += stride; \ +} + +#define ST_SP4_INC(in0, in1, in2, in3, \ + pdst, stride) \ +{ \ + ST_SP2_INC(in0, in1, pdst, stride); \ + ST_SP2_INC(in2, in3, pdst, stride); \ +} + +#define ST_SP5_INC(in0, in1, in2, in3, \ + in4, pdst, stride) \ +{ \ + ST_SP2_INC(in0, in1, pdst, stride); \ + ST_SP2_INC(in2, in3, pdst, stride); \ + ST_SP(in4, (pdst)); \ + (pdst) += stride; \ +} + +#define ST_SP6_INC(in0, in1, in2, in3, \ + in4, in5, pdst, stride) \ +{ \ + ST_SP2_INC(in0, in1, pdst, stride); \ + ST_SP2_INC(in2, in3, pdst, stride); \ + ST_SP2_INC(in4, in5, pdst, stride); \ +} + +#define ST_SP7_INC(in0, in1, in2, in3, in4, \ + in5, in6, pdst, stride) \ +{ \ + ST_SP2_INC(in0, in1, pdst, stride); \ + ST_SP2_INC(in2, in3, pdst, stride); \ + ST_SP2_INC(in4, in5, pdst, stride); \ + ST_SP(in6, (pdst)); \ + (pdst) += stride; \ +} + +#define ST_SP8_INC(in0, in1, in2, in3, in4, in5, \ + in6, in7, pdst, stride) \ +{ \ + ST_SP4_INC(in0, in1, in2, in3, pdst, stride); \ + ST_SP4_INC(in4, in5, in6, in7, pdst, stride); \ +} + +#define ST_SP16_INC(in0, in1, in2, in3, in4, in5, in6, \ + in7, in8, in9, in10, in11, in12, \ + in13, in14, in15, pdst, stride) \ +{ \ + ST_SP8_INC(in0, in1, in2, in3, in4, in5, in6, \ + in7, pdst, stride); \ + ST_SP8_INC(in8, in9, in10, in11, in12, in13, in14, \ + in15, pdst, stride); \ +} + /* Description : Store vectors of double precision floating point elements with stride Arguments : Inputs - in0, in1, pdst, stride Details : Store 2 double precision floating point elements from 'in0' to (pdst) @@ -121,6 +469,104 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ST_DP4(in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ } +#define ST_DP2_INC(in0, in1, pdst, stride) \ +{ \ + ST_DP(in0, (pdst)); \ + (pdst) += stride; \ + ST_DP(in1, (pdst)); \ + (pdst) += stride; \ +} + +#define ST_DP3_INC(in0, in1, in2, \ + pdst, stride) \ +{ \ + ST_DP2_INC(in0, in1, pdst, stride); \ + ST_DP(in2, (pdst)); \ + (pdst) += stride; \ +} + +#define ST_DP4_INC(in0, in1, in2, in3, \ + pdst, stride) \ +{ \ + ST_DP2_INC(in0, in1, pdst, stride); \ + ST_DP2_INC(in2, in3, pdst, stride); \ +} + +#define ST_DP5_INC(in0, in1, in2, in3, \ + in4, pdst, stride) \ +{ \ + ST_DP2_INC(in0, in1, pdst, stride); \ + ST_DP2_INC(in2, in3, pdst, stride); \ + ST_DP(in4, (pdst)); \ + (pdst) += stride; \ +} + +#define ST_DP6_INC(in0, in1, in2, in3, \ + in4, in5, pdst, stride) \ +{ \ + ST_DP2_INC(in0, in1, pdst, stride); \ + ST_DP2_INC(in2, in3, pdst, stride); \ + ST_DP2_INC(in4, in5, pdst, stride); \ +} + +#define ST_DP7_INC(in0, in1, in2, in3, in4, \ + in5, in6, pdst, stride) \ +{ \ + ST_DP2_INC(in0, in1, pdst, stride); \ + ST_DP2_INC(in2, in3, pdst, stride); \ + ST_DP2_INC(in4, in5, pdst, stride); \ + ST_DP(in6, (pdst)); \ + (pdst) += stride; \ +} + +#define ST_DP8_INC(in0, in1, in2, in3, in4, in5, \ + in6, in7, pdst, stride) \ +{ \ + ST_DP4_INC(in0, in1, in2, in3, pdst, stride); \ + ST_DP4_INC(in4, in5, in6, in7, pdst, stride); \ +} + +#define ST_DP16_INC(in0, in1, in2, in3, in4, in5, in6, \ + in7, in8, in9, in10, in11, in12, \ + in13, in14, in15, pdst, stride) \ +{ \ + ST_DP8_INC(in0, in1, in2, in3, in4, in5, in6, \ + in7, pdst, stride); \ + ST_DP8_INC(in8, in9, in10, in11, in12, in13, in14, \ + in15, pdst, stride); \ +} + +/* Description : shuffle elements in vector as shf_val + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE +*/ +#define SHF_W2(RTYPE, in0, in1, out0, out1, shf_val) \ +{ \ + out0 = (RTYPE) __msa_shf_w((v4i32) in0, shf_val); \ + out1 = (RTYPE) __msa_shf_w((v4i32) in1, shf_val); \ +} +#define SHF_W2_SP(...) 
SHF_W2(v4f32, __VA_ARGS__) +#define SHF_W2_DP(...) SHF_W2(v2f64, __VA_ARGS__) + +#define SHF_W3(RTYPE, in0, in1, in2, out0, out1, out2, \ + shf_val) \ +{ \ + out0 = (RTYPE) __msa_shf_w((v4i32) in0, shf_val); \ + out1 = (RTYPE) __msa_shf_w((v4i32) in1, shf_val); \ + out2 = (RTYPE) __msa_shf_w((v4i32) in2, shf_val); \ +} +#define SHF_W3_SP(...) SHF_W3(v4f32, __VA_ARGS__) + +#define SHF_W4(RTYPE, in0, in1, in2, in3, \ + out0, out1, out2, out3, shf_val) \ +{ \ + SHF_W2(RTYPE, in0, in1, out0, out1, shf_val); \ + SHF_W2(RTYPE, in2, in3, out2, out3, shf_val); \ +} +#define SHF_W4_SP(...) SHF_W4(v4f32, __VA_ARGS__) +#define SHF_W4_DP(...) SHF_W4(v2f64, __VA_ARGS__) + /* Description : Interleave both left and right half of input vectors Arguments : Inputs - in0, in1 Outputs - out0, out1 @@ -134,12 +580,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. out1 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1); \ } #define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__) +#define ILVRL_W2_SP(...) ILVRL_W2(v4f32, __VA_ARGS__) #define ILVRL_D2(RTYPE, in0, in1, out0, out1) \ { \ out0 = (RTYPE) __msa_ilvr_d((v2i64) in0, (v2i64) in1); \ out1 = (RTYPE) __msa_ilvl_d((v2i64) in0, (v2i64) in1); \ } +#define ILVRL_D2_SP(...) ILVRL_D2(v4f32, __VA_ARGS__) #define ILVRL_D2_DP(...) ILVRL_D2(v2f64, __VA_ARGS__) /* Description : Indexed word element values are replicated to all @@ -158,6 +606,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. out0 = (RTYPE) __msa_splati_w((v4i32) in, stidx); \ out1 = (RTYPE) __msa_splati_w((v4i32) in, (stidx+1)); \ } +#define SPLATI_W2_SP(...) SPLATI_W2(v4f32, __VA_ARGS__) #define SPLATI_W4(RTYPE, in, out0, out1, out2, out3) \ { \ @@ -166,22 +615,132 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. } #define SPLATI_W4_SP(...) 
SPLATI_W4(v4f32, __VA_ARGS__) +#define SPLATI_D2(RTYPE, in, out0, out1) \ +{ \ + out0 = (RTYPE) __msa_splati_d((v2i64) in, 0); \ + out1 = (RTYPE) __msa_splati_d((v2i64) in, 1); \ +} +#define SPLATI_D2_DP(...) SPLATI_D2(v2f64, __VA_ARGS__) + +/* Description : Pack even double word elements of vector pairs + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even double word elements of 'in0' are copied to the left half + of 'out0' & even double word elements of 'in1' are copied to + the right half of 'out0'. +*/ +#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ +{ \ + out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1); \ + out1 = (RTYPE) __msa_pckev_d((v2i64) in2, (v2i64) in3); \ +} +#define PCKEV_D2_SP(...) PCKEV_D2(v4f32, __VA_ARGS__) +#define PCKEV_D2_SD(...) PCKEV_D2(v2f64, __VA_ARGS__) + +#define PCKEV_D3(RTYPE, in0, in1, in2, in3, in4, in5, \ + out0, out1, out2) \ +{ \ + out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1); \ + out1 = (RTYPE) __msa_pckev_d((v2i64) in2, (v2i64) in3); \ + out2 = (RTYPE) __msa_pckev_d((v2i64) in4, (v2i64) in5); \ +} +#define PCKEV_D3_SP(...) PCKEV_D3(v4f32, __VA_ARGS__) + +#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) \ +{ \ + PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ + PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ +} +#define PCKEV_D4_SP(...) PCKEV_D4(v4f32, __VA_ARGS__) + +/* Description : pack both even and odd half of input vectors + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even double word elements of 'in0' and 'in1' are copied to the + 'out0' & odd double word elements of 'in0' and 'in1' are + copied to the 'out1'. +*/ +#define PCKEVOD_W2(RTYPE, in0, in1, out0, out1) \ +{ \ + out0 = (RTYPE) __msa_pckev_w((v4i32) in0, (v4i32) in1); \ + out1 = (RTYPE) __msa_pckod_w((v4i32) in0, (v4i32) in1); \ +} +#define PCKEVOD_W2_SP(...) 
PCKEVOD_W2(v4f32, __VA_ARGS__) + +#define PCKEVOD_D2(RTYPE, in0, in1, out0, out1) \ +{ \ + out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1); \ + out1 = (RTYPE) __msa_pckod_d((v2i64) in0, (v2i64) in1); \ +} +#define PCKEVOD_D2_DP(...) PCKEVOD_D2(v2f64, __VA_ARGS__) + +/* Description : Multiplication of pairs of vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Details : Each element from 'in0' is multiplied with elements from 'in1' + and the result is written to 'out0' +*/ +#define MUL2(in0, in1, in2, in3, out0, out1) \ +{ \ + out0 = in0 * in1; \ + out1 = in2 * in3; \ +} +#define MUL3(in0, in1, in2, in3, in4, in5, \ + out0, out1, out2) \ +{ \ + out0 = in0 * in1; \ + out1 = in2 * in3; \ + out2 = in4 * in5; \ +} +#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) \ +{ \ + MUL2(in0, in1, in2, in3, out0, out1); \ + MUL2(in4, in5, in6, in7, out2, out3); \ +} + +/* Description : Addition of 2 pairs of variables + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Details : Each element in 'in0' is added to 'in1' and result is written + to 'out0'. 
+*/ +#define ADD2(in0, in1, in2, in3, out0, out1) \ +{ \ + out0 = in0 + in1; \ + out1 = in2 + in3; \ +} +#define ADD3(in0, in1, in2, in3, in4, in5, \ + out0, out1, out2) \ +{ \ + out0 = in0 + in1; \ + out1 = in2 + in3; \ + out2 = in4 + in5; \ +} +#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) \ +{ \ + ADD2(in0, in1, in2, in3, out0, out1); \ + ADD2(in4, in5, in6, in7, out2, out3); \ +} + /* Description : Transpose 4x4 block with word elements in vectors Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1, out2, out3 Return Type - as per RTYPE */ -#define TRANSPOSE4x4_W(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \ -{ \ - v4i32 s0_m, s1_m, s2_m, s3_m; \ - \ - ILVRL_W2_SW(in1, in0, s0_m, s1_m); \ - ILVRL_W2_SW(in3, in2, s2_m, s3_m); \ - \ - out0 = (RTYPE) __msa_ilvr_d((v2i64) s2_m, (v2i64) s0_m); \ - out1 = (RTYPE) __msa_ilvl_d((v2i64) s2_m, (v2i64) s0_m); \ - out2 = (RTYPE) __msa_ilvr_d((v2i64) s3_m, (v2i64) s1_m); \ - out3 = (RTYPE) __msa_ilvl_d((v2i64) s3_m, (v2i64) s1_m); \ +#define TRANSPOSE4x4_W(RTYPE, in0, in1, in2, in3, \ + out0, out1, out2, out3) \ +{ \ + v4i32 s0_m, s1_m, s2_m, s3_m; \ + \ + ILVRL_W2_SW(in1, in0, s0_m, s1_m); \ + ILVRL_W2_SW(in3, in2, s2_m, s3_m); \ + ILVRL_D2(RTYPE, s2_m, s0_m, out0, out1); \ + ILVRL_D2(RTYPE, s3_m, s1_m, out2, out3); \ } #define TRANSPOSE4x4_SP_SP(...) 
TRANSPOSE4x4_W(v4f32, __VA_ARGS__) diff --git a/kernel/mips/sgemm_kernel_8x8_msa.c b/kernel/mips/sgemm_kernel_8x8_msa.c index 611ebabac..1695471ad 100644 --- a/kernel/mips/sgemm_kernel_8x8_msa.c +++ b/kernel/mips/sgemm_kernel_8x8_msa.c @@ -35,20 +35,26 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, #endif ) { - BLASLONG i, j, l; + BLASLONG i, j, l, temp; +#if defined(TRMMKERNEL) + BLASLONG off; +#endif FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7; FLOAT *pa0, *pb0; FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; FLOAT tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; - FLOAT a0, a1; - FLOAT b0, b1, b2, b3, b4, b5, b6, b7; + FLOAT a0, a1, b0, b1, b2, b3, b4, b5, b6, b7; v4f32 v_alpha = {alpha, alpha, alpha, alpha}; v4f32 src_a0, src_a1, src_b, src_b0, src_b1; v4f32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; v4f32 res0, res1, res2, res3, res4, res5, res6, res7; v4f32 res8, res9, res10, res11, res12, res13, res14, res15; - for (j = (n / 8); j--;) +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#endif + + for (j = (n >> 3); j--;) { pc0 = C; pc1 = pc0 + ldc; @@ -59,13 +65,35 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pc6 = pc5 + ldc; pc7 = pc6 + ldc; - pa0 = A; - for (i = (m / 8); i--;) - { - pb0 = B; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif - LD_SP2(pa0, 4, src_a0, src_a1); - LD_SP2(pb0, 4, src_b0, src_b1); + pa0 = A; + for (i = (m >> 3); i--;) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 8; + pb0 = B + off * 8; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 8; // number of values in A +#else + temp = off + 8; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + + LD_SP2_INC(pa0, 4, src_a0, src_a1); + LD_SP2_INC(pb0, 4, 
src_b0, src_b1); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 = src_a0 * src_b; @@ -99,13 +127,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res14 = src_a0 * src_b; res15 = src_a1 * src_b; - pa0 += 8; - pb0 += 8; - - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { - LD_SP2(pa0, 4, src_a0, src_a1); - LD_SP2(pb0, 4, src_b0, src_b1); + LD_SP2_INC(pa0, 4, src_a0, src_a1); + LD_SP2_INC(pb0, 4, src_b0, src_b1); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; @@ -139,11 +164,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res14 += src_a0 * src_b; res15 += src_a1 * src_b; - pa0 += 8; - pb0 += 8; - - LD_SP2(pa0, 4, src_a0, src_a1); - LD_SP2(pb0, 4, src_b0, src_b1); + LD_SP2_INC(pa0, 4, src_a0, src_a1); + LD_SP2_INC(pb0, 4, src_b0, src_b1); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; @@ -176,15 +198,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF); res14 += src_a0 * src_b; res15 += src_a1 * src_b; - - pa0 += 8; - pb0 += 8; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { - LD_SP2(pa0, 4, src_a0, src_a1); - LD_SP2(pb0, 4, src_b0, src_b1); + LD_SP2_INC(pa0, 4, src_a0, src_a1); + LD_SP2_INC(pb0, 4, src_b0, src_b1); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; @@ -217,11 +236,18 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF); res14 += src_a0 * src_b; res15 += src_a1 * src_b; - - pa0 += 8; - pb0 += 8; } +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; + dst2 = res2 * v_alpha; + dst3 = res3 * v_alpha; + dst4 = res4 * v_alpha; + dst5 = res5 * v_alpha; + dst6 = res6 * v_alpha; + dst7 = res7 * v_alpha; +#else LD_SP2(pc0, 4, dst0, dst1); LD_SP2(pc1, 4, dst2, dst3); LD_SP2(pc2, 4, dst4, dst5); @@ -235,12 +261,22 @@ 
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, dst5 += res5 * v_alpha; dst6 += res6 * v_alpha; dst7 += res7 * v_alpha; +#endif + ST_SP2_INC(dst0, dst1, pc0, 4); + ST_SP2_INC(dst2, dst3, pc1, 4); + ST_SP2_INC(dst4, dst5, pc2, 4); + ST_SP2_INC(dst6, dst7, pc3, 4); - ST_SP2(dst0, dst1, pc0, 4); - ST_SP2(dst2, dst3, pc1, 4); - ST_SP2(dst4, dst5, pc2, 4); - ST_SP2(dst6, dst7, pc3, 4); - +#if defined(TRMMKERNEL) + dst0 = res8 * v_alpha; + dst1 = res9 * v_alpha; + dst2 = res10 * v_alpha; + dst3 = res11 * v_alpha; + dst4 = res12 * v_alpha; + dst5 = res13 * v_alpha; + dst6 = res14 * v_alpha; + dst7 = res15 * v_alpha; +#else LD_SP2(pc4, 4, dst0, dst1); LD_SP2(pc5, 4, dst2, dst3); LD_SP2(pc6, 4, dst4, dst5); @@ -254,28 +290,54 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, dst5 += res13 * v_alpha; dst6 += res14 * v_alpha; dst7 += res15 * v_alpha; +#endif + ST_SP2_INC(dst0, dst1, pc4, 4); + ST_SP2_INC(dst2, dst3, pc5, 4); + ST_SP2_INC(dst4, dst5, pc6, 4); + ST_SP2_INC(dst6, dst7, pc7, 4); - ST_SP2(dst0, dst1, pc4, 4); - ST_SP2(dst2, dst3, pc5, 4); - ST_SP2(dst4, dst5, pc6, 4); - ST_SP2(dst6, dst7, pc7, 4); +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 8; // number of values in A +#else + temp -= 8; // number of values in B +#endif + pa0 += temp * 8; + pb0 += temp * 8; +#endif - pc0 += 8; - pc1 += 8; - pc2 += 8; - pc3 += 8; - pc4 += 8; - pc5 += 8; - pc6 += 8; - pc7 += 8; +#ifdef LEFT + off += 8; // number of values in A +#endif +#endif } - for (i = ((m & 4) / 4); i--;) + if (m & 4) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 4; + pb0 = B + off * 8; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 4; // number of values in A 
+#else + temp = off + 8; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif src_a0 = LD_SP(pa0); - LD_SP2(pb0, 4, src_b0, src_b1); + LD_SP2_INC(pb0, 4, src_b0, src_b1); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 = src_a0 * src_b; @@ -302,12 +364,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res7 = src_a0 * src_b; pa0 += 4; - pb0 += 8; - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { src_a0 = LD_SP(pa0); - LD_SP2(pb0, 4, src_b0, src_b1); + LD_SP2_INC(pb0, 4, src_b0, src_b1); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; @@ -334,10 +395,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res7 += src_a0 * src_b; pa0 += 4; - pb0 += 8; src_a0 = LD_SP(pa0); - LD_SP2(pb0, 4, src_b0, src_b1); + LD_SP2_INC(pb0, 4, src_b0, src_b1); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; @@ -364,13 +424,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res7 += src_a0 * src_b; pa0 += 4; - pb0 += 8; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { src_a0 = LD_SP(pa0); - LD_SP2(pb0, 4, src_b0, src_b1); + LD_SP2_INC(pb0, 4, src_b0, src_b1); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; @@ -397,9 +456,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res7 += src_a0 * src_b; pa0 += 4; - pb0 += 8; } +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; + dst2 = res2 * v_alpha; + dst3 = res3 * v_alpha; +#else dst0 = LD_SP(pc0); dst1 = LD_SP(pc1); dst2 = LD_SP(pc2); @@ -409,12 +473,18 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, dst1 += res1 * v_alpha; dst2 += res2 * v_alpha; dst3 += res3 * v_alpha; - +#endif ST_SP(dst0, pc0); ST_SP(dst1, pc1); ST_SP(dst2, pc2); ST_SP(dst3, pc3); +#if defined(TRMMKERNEL) + dst0 = res4 * v_alpha; + dst1 = res5 * v_alpha; + dst2 = res6 * 
v_alpha; + dst3 = res7 * v_alpha; +#else dst0 = LD_SP(pc4); dst1 = LD_SP(pc5); dst2 = LD_SP(pc6); @@ -424,12 +494,29 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, dst1 += res5 * v_alpha; dst2 += res6 * v_alpha; dst3 += res7 * v_alpha; - +#endif ST_SP(dst0, pc4); ST_SP(dst1, pc5); ST_SP(dst2, pc6); ST_SP(dst3, pc7); +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 8; // number of values in B +#endif + pa0 += temp * 4; + pb0 += temp * 8; +#endif + +#ifdef LEFT + off += 4; // number of values in A +#endif +#endif + pc0 += 4; pc1 += 4; pc2 += 4; @@ -440,9 +527,27 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pc7 += 4; } - for (i = ((m & 2) / 2); i--;) + if (m & 2) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 2; + pb0 = B + off * 8; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 2; // number of values in A +#else + temp = off + 8; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif a0 = pa0[0]; b0 = pb0[0]; @@ -482,7 +587,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 += 2; pb0 += 8; - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { a0 = pa0[0]; b0 = pb0[0]; @@ -561,7 +666,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pb0 += 8; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { a0 = pa0[0]; b0 = pb0[0]; @@ -611,6 +716,16 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, tmp12 = alpha * tmp12; tmp14 = alpha * tmp14; +#if defined(TRMMKERNEL) + pc0[0] = tmp0; + pc1[0] = tmp2; + pc2[0] = tmp4; + pc3[0] = tmp6; + pc4[0] = tmp8; + pc5[0] 
= tmp10; + pc6[0] = tmp12; + pc7[0] = tmp14; +#else pc0[0] += tmp0; pc1[0] += tmp2; pc2[0] += tmp4; @@ -619,7 +734,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pc5[0] += tmp10; pc6[0] += tmp12; pc7[0] += tmp14; - +#endif tmp1 = alpha * tmp1; tmp3 = alpha * tmp3; tmp5 = alpha * tmp5; @@ -629,6 +744,16 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, tmp13 = alpha * tmp13; tmp15 = alpha * tmp15; +#if defined(TRMMKERNEL) + pc0[1] = tmp1; + pc1[1] = tmp3; + pc2[1] = tmp5; + pc3[1] = tmp7; + pc4[1] = tmp9; + pc5[1] = tmp11; + pc6[1] = tmp13; + pc7[1] = tmp15; +#else pc0[1] += tmp1; pc1[1] += tmp3; pc2[1] += tmp5; @@ -637,6 +762,24 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pc5[1] += tmp11; pc6[1] += tmp13; pc7[1] += tmp15; +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 8; // number of values in B +#endif + pa0 += temp * 2; + pb0 += temp * 8; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif +#endif pc0 += 2; pc1 += 2; @@ -648,9 +791,27 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pc7 += 2; } - for (i = (m & 1); i--;) + if (m & 1) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 1; + pb0 = B + off * 8; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 1; // number of values in A +#else + temp = off + 8; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif a0 = pa0[0]; b0 = pb0[0]; @@ -680,7 +841,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 += 1; pb0 += 8; - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); 
l--;) { a0 = pa0[0]; b0 = pb0[0]; @@ -739,14 +900,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pb0 += 8; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; b1 = pb0[1]; - tmp1 += a0 * b1; + tmp1 += a0 * b1; b2 = pb0[2]; tmp2 += a0 * b2; @@ -779,6 +940,16 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, tmp6 = alpha * tmp6; tmp7 = alpha * tmp7; +#if defined(TRMMKERNEL) + pc0[0] = tmp0; + pc1[0] = tmp1; + pc2[0] = tmp2; + pc3[0] = tmp3; + pc4[0] = tmp4; + pc5[0] = tmp5; + pc6[0] = tmp6; + pc7[0] = tmp7; +#else pc0[0] += tmp0; pc1[0] += tmp1; pc2[0] += tmp2; @@ -787,7 +958,24 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pc5[0] += tmp5; pc6[0] += tmp6; pc7[0] += tmp7; +#endif +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 8; // number of values in B +#endif + pa0 += temp * 1; + pb0 += temp * 8; +#endif + +#ifdef LEFT + off += 1; // number of values in A +#endif +#endif pc0 += 1; pc1 += 1; pc2 += 1; @@ -798,13 +986,17 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pc7 += 1; } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 8; // number of values in A +#endif + l = (k << 3); B = B + l; i = (ldc << 3); C = C + i; } - for (j = ((n & 4) / 4); j--;) + if (n & 4) { pc0 = C; pc1 = pc0 + ldc; @@ -813,11 +1005,33 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 = A; - for (i = (m / 8); i--;) - { - pb0 = B; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif - LD_SP2(pa0, 4, src_a0, src_a1); + for (i = (m >> 3); i--;) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 8; + pb0 = B + off * 4; +#endif + +#if 
(defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 8; // number of values in A +#else + temp = off + 4; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + + LD_SP2_INC(pa0, 4, src_a0, src_a1); src_b0 = LD_SP(pb0); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); @@ -836,12 +1050,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res6 = src_a0 * src_b; res7 = src_a1 * src_b; - pa0 += 8; pb0 += 4; - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { - LD_SP2(pa0, 4, src_a0, src_a1); + LD_SP2_INC(pa0, 4, src_a0, src_a1); src_b0 = LD_SP(pb0); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); @@ -860,10 +1073,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res6 += src_a0 * src_b; res7 += src_a1 * src_b; - pa0 += 8; pb0 += 4; - LD_SP2(pa0, 4, src_a0, src_a1); + LD_SP2_INC(pa0, 4, src_a0, src_a1); src_b0 = LD_SP(pb0); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); @@ -882,13 +1094,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res6 += src_a0 * src_b; res7 += src_a1 * src_b; - pa0 += 8; pb0 += 4; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { - LD_SP2(pa0, 4, src_a0, src_a1); + LD_SP2_INC(pa0, 4, src_a0, src_a1); src_b0 = LD_SP(pb0); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); @@ -907,10 +1118,19 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res6 += src_a0 * src_b; res7 += src_a1 * src_b; - pa0 += 8; pb0 += 4; } +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; + dst2 = res2 * v_alpha; + dst3 = res3 * v_alpha; + dst4 = res4 * v_alpha; + dst5 = res5 * v_alpha; + dst6 = res6 * v_alpha; + dst7 = res7 * v_alpha; +#else LD_SP2(pc0, 4, dst0, dst1); LD_SP2(pc1, 4, dst2, dst3); LD_SP2(pc2, 4, dst4, dst5); @@ -924,21 +1144,52 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, 
FLOAT *B, dst5 += res5 * v_alpha; dst6 += res6 * v_alpha; dst7 += res7 * v_alpha; +#endif - ST_SP2(dst0, dst1, pc0, 4); - ST_SP2(dst2, dst3, pc1, 4); - ST_SP2(dst4, dst5, pc2, 4); - ST_SP2(dst6, dst7, pc3, 4); + ST_SP2_INC(dst0, dst1, pc0, 4); + ST_SP2_INC(dst2, dst3, pc1, 4); + ST_SP2_INC(dst4, dst5, pc2, 4); + ST_SP2_INC(dst6, dst7, pc3, 4); - pc0 += 8; - pc1 += 8; - pc2 += 8; - pc3 += 8; +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 8; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 8; + pb0 += temp * 4; +#endif + +#ifdef LEFT + off += 8; // number of values in A +#endif +#endif } - for (i = ((m & 4) / 4); i--;) + if (m & 4) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 4; + pb0 = B + off * 4; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 4; // number of values in A +#else + temp = off + 4; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif src_a0 = LD_SP(pa0); src_b0 = LD_SP(pb0); @@ -958,7 +1209,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 += 4; pb0 += 4; - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { src_a0 = LD_SP(pa0); src_b0 = LD_SP(pb0); @@ -997,7 +1248,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pb0 += 4; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { src_a0 = LD_SP(pa0); src_b0 = LD_SP(pb0); @@ -1017,7 +1268,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 += 4; pb0 += 4; } - +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; + dst2 = res2 * v_alpha; + dst3 = res3 * v_alpha; +#else dst0 = LD_SP(pc0); dst1 = LD_SP(pc1); dst2 = LD_SP(pc2); 
@@ -1027,21 +1283,55 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, dst1 += res1 * v_alpha; dst2 += res2 * v_alpha; dst3 += res3 * v_alpha; - +#endif ST_SP(dst0, pc0); ST_SP(dst1, pc1); ST_SP(dst2, pc2); ST_SP(dst3, pc3); +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 4; + pb0 += temp * 4; +#endif + +#ifdef LEFT + off += 4; // number of values in A +#endif +#endif pc0 += 4; pc1 += 4; pc2 += 4; pc3 += 4; } - for (i = ((m & 2) / 2); i--;) + if (m & 2) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 2; + pb0 = B + off * 4; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 2; // number of values in A +#else + temp = off + 4; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif a0 = pa0[0]; b0 = pb0[0]; @@ -1065,7 +1355,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 += 2; pb0 += 4; - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { a0 = pa0[0]; b0 = pb0[0]; @@ -1112,7 +1402,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pb0 += 4; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { a0 = pa0[0]; b0 = pb0[0]; @@ -1142,20 +1432,50 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, tmp4 = alpha * tmp4; tmp6 = alpha * tmp6; +#if defined(TRMMKERNEL) + pc0[0] = tmp0; + pc1[0] = tmp2; + pc2[0] = tmp4; + pc3[0] = tmp6; +#else pc0[0] += tmp0; pc1[0] += tmp2; pc2[0] += tmp4; pc3[0] += tmp6; - +#endif tmp1 = alpha * tmp1; tmp3 = alpha * tmp3; tmp5 = alpha * tmp5; tmp7 = alpha * tmp7; +#if defined(TRMMKERNEL) + pc0[1] = tmp1; + pc1[1] = 
tmp3; + pc2[1] = tmp5; + pc3[1] = tmp7; +#else pc0[1] += tmp1; pc1[1] += tmp3; pc2[1] += tmp5; pc3[1] += tmp7; +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 2; + pb0 += temp * 4; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif +#endif pc0 += 2; pc1 += 2; @@ -1163,9 +1483,27 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pc3 += 2; } - for (i = (m & 1); i--;) + if (m & 1) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 1; + pb0 = B + off * 4; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 1; // number of values in A +#else + temp = off + 4; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif a0 = pa0[0]; b0 = pb0[0]; @@ -1183,7 +1521,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 += 1; pb0 += 4; - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { a0 = pa0[0]; b0 = pb0[0]; @@ -1218,7 +1556,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pb0 += 4; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { a0 = pa0[0]; b0 = pb0[0]; @@ -1242,35 +1580,84 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, tmp2 = alpha * tmp2; tmp3 = alpha * tmp3; +#if defined(TRMMKERNEL) + pc0[0] = tmp0; + pc1[0] = tmp1; + pc2[0] = tmp2; + pc3[0] = tmp3; +#else pc0[0] += tmp0; pc1[0] += tmp1; pc2[0] += tmp2; pc3[0] += tmp3; +#endif +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 
4; // number of values in B +#endif + pa0 += temp * 1; + pb0 += temp * 4; +#endif + +#ifdef LEFT + off += 1; // number of values in A +#endif +#endif pc0 += 1; pc1 += 1; pc2 += 1; pc3 += 1; } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 4; // number of values in A +#endif + l = (k << 2); B = B + l; i = (ldc << 2); C = C + i; } - for (j = ((n & 2) / 2); j--;) + if (n & 2) { pc0 = C; pc1 = pc0 + ldc; pa0 = A; - for (i = (m / 8); i--;) - { - pb0 = B; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif - LD_SP2(pa0, 4, src_a0, src_a1); + for (i = (m >> 3); i--;) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 8; + pb0 = B + off * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 8; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + + LD_SP2_INC(pa0, 4, src_a0, src_a1); src_b0[0] = pb0[0]; src_b0[1] = pb0[1]; @@ -1282,12 +1669,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res2 = src_a0 * src_b; res3 = src_a1 * src_b; - pa0 += 8; pb0 += 2; - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { - LD_SP2(pa0, 4, src_a0, src_a1); + LD_SP2_INC(pa0, 4, src_a0, src_a1); src_b0[0] = pb0[0]; src_b0[1] = pb0[1]; @@ -1299,10 +1685,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res2 += src_a0 * src_b; res3 += src_a1 * src_b; - pa0 += 8; pb0 += 2; - LD_SP2(pa0, 4, src_a0, src_a1); + LD_SP2_INC(pa0, 4, src_a0, src_a1); src_b0[0] = pb0[0]; src_b0[1] = pb0[1]; @@ -1314,13 +1699,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res2 += src_a0 * src_b; res3 += src_a1 * src_b; - pa0 += 8; pb0 += 2; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { - LD_SP2(pa0, 4, src_a0, src_a1); + 
LD_SP2_INC(pa0, 4, src_a0, src_a1); src_b0[0] = pb0[0]; src_b0[1] = pb0[1]; @@ -1332,10 +1716,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res2 += src_a0 * src_b; res3 += src_a1 * src_b; - pa0 += 8; pb0 += 2; } +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; + dst2 = res2 * v_alpha; + dst3 = res3 * v_alpha; +#else LD_SP2(pc0, 4, dst0, dst1); LD_SP2(pc1, 4, dst2, dst3); @@ -1343,17 +1732,49 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, dst1 += res1 * v_alpha; dst2 += res2 * v_alpha; dst3 += res3 * v_alpha; +#endif + ST_SP2_INC(dst0, dst1, pc0, 4); + ST_SP2_INC(dst2, dst3, pc1, 4); - ST_SP2(dst0, dst1, pc0, 4); - ST_SP2(dst2, dst3, pc1, 4); +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 8; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 8; + pb0 += temp * 2; +#endif - pc0 += 8; - pc1 += 8; +#ifdef LEFT + off += 8; // number of values in A +#endif +#endif } - for (i = ((m & 4) / 4); i--;) + if (m & 4) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 4; + pb0 = B + off * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 4; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif src_a0 = LD_SP(pa0); src_b0[0] = pb0[0]; @@ -1368,7 +1789,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 += 4; pb0 += 2; - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { src_a0 = LD_SP(pa0); src_b0[0] = pb0[0]; @@ -1397,7 +1818,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pb0 += 2; } - if ((k - 1) & 1) + if 
((temp - 1) & 1) { src_a0 = LD_SP(pa0); src_b0[0] = pb0[0]; @@ -1413,22 +1834,60 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pb0 += 2; } +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; +#else dst0 = LD_SP(pc0); dst1 = LD_SP(pc1); dst0 += res0 * v_alpha; dst1 += res1 * v_alpha; - +#endif ST_SP(dst0, pc0); ST_SP(dst1, pc1); +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 4; + pb0 += temp * 2; +#endif + +#ifdef LEFT + off += 4; // number of values in A +#endif +#endif pc0 += 4; pc1 += 4; } - for (i = ((m & 2) / 2); i--;) + if (m & 2) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 2; + pb0 = B + off * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 2; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif a0 = pa0[0]; b0 = pb0[0]; @@ -1444,7 +1903,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 += 2; pb0 += 2; - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { a0 = pa0[0]; b0 = pb0[0]; @@ -1475,7 +1934,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pb0 += 2; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { a0 = pa0[0]; b0 = pb0[0]; @@ -1493,24 +1952,64 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, } tmp0 = alpha * tmp0; - tmp2 = alpha * tmp2; - - pc0[0] += tmp0; - pc1[0] += tmp2; - tmp1 = alpha * tmp1; + tmp2 = alpha * tmp2; tmp3 = alpha * tmp3; +#if defined(TRMMKERNEL) + pc0[0] = tmp0; + pc1[0] = tmp2; + pc0[1] = tmp1; + pc1[1] = tmp3; 
+#else + pc0[0] += tmp0; + pc1[0] += tmp2; pc0[1] += tmp1; pc1[1] += tmp3; +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 2; + pb0 += temp * 2; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif +#endif pc0 += 2; pc1 += 2; } - for (i = (m & 1); i--;) + if (m & 1) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 1; + pb0 = B + off * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 1; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif a0 = pa0[0]; b0 = pb0[0]; @@ -1522,7 +2021,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 += 1; pb0 += 2; - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { a0 = pa0[0]; b0 = pb0[0]; @@ -1545,7 +2044,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pb0 += 2; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { a0 = pa0[0]; b0 = pb0[0]; @@ -1561,87 +2060,166 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, tmp0 = alpha * tmp0; tmp1 = alpha * tmp1; +#if defined(TRMMKERNEL) + pc0[0] = tmp0; + pc1[0] = tmp1; +#else pc0[0] += tmp0; pc1[0] += tmp1; +#endif +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 1; + pb0 += temp * 2; +#endif + +#ifdef LEFT + off += 1; // number of values in A +#endif +#endif pc0 += 1; pc1 += 1; } +#if defined(TRMMKERNEL) && !defined(LEFT) 
+ off += 2; // number of values in A +#endif l = (k << 1); B = B + l; i = (ldc << 1); C = C + i; } - for (j = (n & 1); j--;) + if (n & 1) { pc0 = C; pa0 = A; - for (i = (m / 8); i--;) - { - pb0 = B; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif - LD_SP2(pa0, 4, src_a0, src_a1); + for (i = (m >> 3); i--;) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 8; + pb0 = B + off * 1; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 8; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + + LD_SP2_INC(pa0, 4, src_a0, src_a1); src_b0[0] = pb0[0]; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 = src_a0 * src_b; res1 = src_a1 * src_b; - pa0 += 8; pb0 += 1; - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { - LD_SP2(pa0, 4, src_a0, src_a1); + LD_SP2_INC(pa0, 4, src_a0, src_a1); src_b0[0] = pb0[0]; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; res1 += src_a1 * src_b; - pa0 += 8; pb0 += 1; - LD_SP2(pa0, 4, src_a0, src_a1); + LD_SP2_INC(pa0, 4, src_a0, src_a1); src_b0[0] = pb0[0]; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; res1 += src_a1 * src_b; - pa0 += 8; pb0 += 1; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { - LD_SP2(pa0, 4, src_a0, src_a1); + LD_SP2_INC(pa0, 4, src_a0, src_a1); src_b0[0] = pb0[0]; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; res1 += src_a1 * src_b; - pa0 += 8; pb0 += 1; } +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; +#else LD_SP2(pc0, 4, dst0, dst1); dst0 += res0 * v_alpha; dst1 += res1 * v_alpha; +#endif + ST_SP2_INC(dst0, dst1, pc0, 4); - ST_SP2(dst0, dst1, pc0, 4); +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && 
!defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 8; // number of values in A +#else + temp -= 1; // number of values in B +#endif + pa0 += temp * 8; + pb0 += temp * 1; +#endif - pc0 += 8; +#ifdef LEFT + off += 8; // number of values in A +#endif +#endif } - for (i = ((m & 4) / 4); i--;) + if (m & 4) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 4; + pb0 = B + off * 1; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 4; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif src_a0 = LD_SP(pa0); src_b0[0] = pb0[0]; @@ -1652,7 +2230,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 += 4; pb0 += 1; - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { src_a0 = LD_SP(pa0); src_b0[0] = pb0[0]; @@ -1673,7 +2251,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pb0 += 1; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { src_a0 = LD_SP(pa0); src_b0[0] = pb0[0]; @@ -1685,18 +2263,55 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pb0 += 1; } +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; +#else dst0 = LD_SP(pc0); dst0 += res0 * v_alpha; - +#endif ST_SP(dst0, pc0); +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 1; // number of values in B +#endif + pa0 += temp * 4; + pb0 += temp * 1; +#endif + +#ifdef LEFT + off += 4; // number of values in A +#endif +#endif pc0 += 4; } - for (i = (m & 2) / 2; i--;) + if (m & 2) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 2; + pb0 = 
B + off * 1; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 2; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif a0 = pa0[0]; b0 = pb0[0]; @@ -1708,7 +2323,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 += 2; pb0 += 1; - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { a0 = pa0[0]; b0 = pb0[0]; @@ -1731,7 +2346,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pb0 += 1; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { a0 = pa0[0]; b0 = pb0[0]; @@ -1744,18 +2359,55 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pb0 += 1; } - tmp0 = alpha * tmp0; +#if defined(TRMMKERNEL) + pc0[0] = tmp0; + pc0[1] = tmp1; +#else pc0[0] += tmp0; - - tmp1 = alpha * tmp1; pc0[1] += tmp1; +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 1; // number of values in B +#endif + pa0 += temp * 2; + pb0 += temp * 1; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif +#endif pc0 += 2; } - for (i = (m & 1); i--;) + if (m & 1) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 1; + pb0 = B + off * 1; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 1; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif a0 = pa0[0]; b0 = pb0[0]; @@ -1764,7 +2416,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 += 1; pb0 += 1; - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 
1) >> 1); l--;) { a0 = pa0[0]; b0 = pb0[0]; @@ -1781,7 +2433,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pb0 += 1; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { a0 = pa0[0]; b0 = pb0[0]; @@ -1791,11 +2443,35 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pb0 += 1; } +#if defined(TRMMKERNEL) + pc0[0] = alpha * tmp0; +#else pc0[0] += alpha * tmp0; +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 1; // number of values in B +#endif + pa0 += temp * 1; + pb0 += temp * 1; +#endif + +#ifdef LEFT + off += 1; // number of values in A +#endif +#endif pc0 += 1; } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; // number of values in A +#endif l = (k << 0); B = B + l; i = (ldc << 0); diff --git a/kernel/mips/sgemm_ncopy_8_msa.c b/kernel/mips/sgemm_ncopy_8_msa.c index 71048f1c3..8618c4435 100644 --- a/kernel/mips/sgemm_ncopy_8_msa.c +++ b/kernel/mips/sgemm_ncopy_8_msa.c @@ -28,14 +28,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include "macros_msa.h" -int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, - FLOAT * __restrict dst) +int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst) { BLASLONG i, j; - FLOAT *psrc0; - FLOAT *psrc1, *psrc2, *psrc3, *psrc4; - FLOAT *psrc5, *psrc6, *psrc7, *psrc8; - FLOAT *pdst; + FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *psrc5, *psrc6, *psrc7; + FLOAT *psrc8, *pdst; v4f32 src0, src1, src2, src3, src4, src5, src6, src7; v4f32 src8, src9, src10, src11, src12, src13, src14, src15; v4f32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; @@ -58,22 +55,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, for (i = (m >> 3); i--;) { - LD_SP2(psrc1, 4, src0, src1); - LD_SP2(psrc2, 4, src2, src3); - LD_SP2(psrc3, 4, src4, src5); - LD_SP2(psrc4, 4, src6, src7); - LD_SP2(psrc5, 4, src8, src9); - LD_SP2(psrc6, 4, src10, src11); - LD_SP2(psrc7, 4, src12, src13); - LD_SP2(psrc8, 4, src14, src15); - psrc1 += 8; - psrc2 += 8; - psrc3 += 8; - psrc4 += 8; - psrc5 += 8; - psrc6 += 8; - psrc7 += 8; - psrc8 += 8; + LD_SP2_INC(psrc1, 4, src0, src1); + LD_SP2_INC(psrc2, 4, src2, src3); + LD_SP2_INC(psrc3, 4, src4, src5); + LD_SP2_INC(psrc4, 4, src6, src7); + LD_SP2_INC(psrc5, 4, src8, src9); + LD_SP2_INC(psrc6, 4, src10, src11); + LD_SP2_INC(psrc7, 4, src12, src13); + LD_SP2_INC(psrc8, 4, src14, src15); TRANSPOSE4x4_SP_SP(src0, src2, src4, src6, dst0, dst2, dst4, dst6); TRANSPOSE4x4_SP_SP(src8, src10, src12, src14, dst1, dst3, dst5, @@ -83,15 +72,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, TRANSPOSE4x4_SP_SP(src9, src11, src13, src15, dst9, dst11, dst13, dst15); - ST_SP2(dst0, dst1, pdst, 4); - ST_SP2(dst2, dst3, pdst + 8, 4); - ST_SP2(dst4, dst5, pdst + 16, 4); - ST_SP2(dst6, dst7, pdst + 24, 4); - ST_SP2(dst8, dst9, pdst + 32, 4); - ST_SP2(dst10, dst11, pdst + 40, 4); - ST_SP2(dst12, dst13, pdst + 48, 4); - ST_SP2(dst14, dst15, pdst + 56, 4); - pdst += 64; + 
ST_SP2_INC(dst0, dst1, pdst, 4); + ST_SP2_INC(dst2, dst3, pdst, 4); + ST_SP2_INC(dst4, dst5, pdst, 4); + ST_SP2_INC(dst6, dst7, pdst, 4); + ST_SP2_INC(dst8, dst9, pdst, 4); + ST_SP2_INC(dst10, dst11, pdst, 4); + ST_SP2_INC(dst12, dst13, pdst, 4); + ST_SP2_INC(dst14, dst15, pdst, 4); } for (i = (m & 7); i--;) @@ -128,9 +116,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, TRANSPOSE4x4_SP_SP(src0, src1, src2, src3, dst0, dst1, dst2, dst3); - ST_SP2(dst0, dst1, pdst, 4); - ST_SP2(dst2, dst3, pdst + 8, 4); - pdst += 16; + ST_SP2_INC(dst0, dst1, pdst, 4); + ST_SP2_INC(dst2, dst3, pdst, 4); } for (i = (m & 3); i--;) diff --git a/kernel/mips/sgemm_tcopy_8_msa.c b/kernel/mips/sgemm_tcopy_8_msa.c index 7d4aecb4b..3542eca21 100644 --- a/kernel/mips/sgemm_tcopy_8_msa.c +++ b/kernel/mips/sgemm_tcopy_8_msa.c @@ -28,14 +28,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include "macros_msa.h" -int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, - FLOAT * __restrict dst) +int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst) { BLASLONG i, j; - FLOAT *psrc0; - FLOAT *psrc1, *psrc2, *psrc3, *psrc4; - FLOAT *psrc5, *psrc6, *psrc7, *psrc8; - FLOAT *pdst0, *pdst1, *pdst2, *pdst3, *pdst4; + FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *psrc5, *psrc6, *psrc7; + FLOAT *psrc8, *pdst0, *pdst1, *pdst2, *pdst3, *pdst4; v4f32 src0, src1, src2, src3, src4, src5, src6, src7; v4f32 src8, src9, src10, src11, src12, src13, src14, src15; @@ -63,22 +60,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, for (i = (n >> 3); i--;) { - LD_SP2(psrc1, 4, src0, src1); - LD_SP2(psrc2, 4, src2, src3); - LD_SP2(psrc3, 4, src4, src5); - LD_SP2(psrc4, 4, src6, src7); - LD_SP2(psrc5, 4, src8, src9); - LD_SP2(psrc6, 4, src10, src11); - LD_SP2(psrc7, 4, src12, src13); - LD_SP2(psrc8, 4, src14, src15); - psrc1 += 8; - psrc2 += 8; - psrc3 += 8; - psrc4 += 8; - psrc5 += 8; - 
psrc6 += 8; - psrc7 += 8; - psrc8 += 8; + LD_SP2_INC(psrc1, 4, src0, src1); + LD_SP2_INC(psrc2, 4, src2, src3); + LD_SP2_INC(psrc3, 4, src4, src5); + LD_SP2_INC(psrc4, 4, src6, src7); + LD_SP2_INC(psrc5, 4, src8, src9); + LD_SP2_INC(psrc6, 4, src10, src11); + LD_SP2_INC(psrc7, 4, src12, src13); + LD_SP2_INC(psrc8, 4, src14, src15); ST_SP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 4); ST_SP8(src8, src9, src10, src11, src12, src13, src14, src15, @@ -105,8 +94,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, psrc7 += 4; psrc8 += 4; - ST_SP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 4); - pdst2 += 32; + ST_SP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 4); } if (n & 2) @@ -155,14 +143,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, for (i = (n >> 3); i--;) { - LD_SP2(psrc1, 4, src0, src1); - LD_SP2(psrc2, 4, src2, src3); - LD_SP2(psrc3, 4, src4, src5); - LD_SP2(psrc4, 4, src6, src7); - psrc1 += 8; - psrc2 += 8; - psrc3 += 8; - psrc4 += 8; + LD_SP2_INC(psrc1, 4, src0, src1); + LD_SP2_INC(psrc2, 4, src2, src3); + LD_SP2_INC(psrc3, 4, src4, src5); + LD_SP2_INC(psrc4, 4, src6, src7); ST_SP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 4); pdst1 += 8 * m; @@ -179,8 +163,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, psrc3 += 4; psrc4 += 4; - ST_SP4(src0, src1, src2, src3, pdst2, 4); - pdst2 += 16; + ST_SP4_INC(src0, src1, src2, src3, pdst2, 4); } if (n & 2) @@ -215,10 +198,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, for (i = (n >> 3); i--;) { - LD_SP2(psrc1, 4, src0, src1); - LD_SP2(psrc2, 4, src2, src3); - psrc1 += 8; - psrc2 += 8; + LD_SP2_INC(psrc1, 4, src0, src1); + LD_SP2_INC(psrc2, 4, src2, src3); ST_SP4(src0, src1, src2, src3, pdst1, 4); pdst1 += 8 * m; @@ -231,8 +212,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, psrc1 += 4; psrc2 += 4; - ST_SP2(src0, src1, pdst2, 
4); - pdst2 += 8; + ST_SP2_INC(src0, src1, pdst2, 4); } if (n & 2) @@ -260,8 +240,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, for (i = (n >> 3); i--;) { - LD_SP2(psrc1, 4, src0, src1); - psrc1 += 8; + LD_SP2_INC(psrc1, 4, src0, src1); ST_SP2(src0, src1, pdst1, 4); pdst1 += 8 * m; @@ -288,5 +267,5 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, } } - return 0; + return 0; } diff --git a/kernel/mips/strsm_kernel_LN_8x8_msa.c b/kernel/mips/strsm_kernel_LN_8x8_msa.c index 516b9752f..53891e64f 100644 --- a/kernel/mips/strsm_kernel_LN_8x8_msa.c +++ b/kernel/mips/strsm_kernel_LN_8x8_msa.c @@ -166,7 +166,7 @@ static void ssolve_8x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a = LD_SP(a + 32); SPLATI_W4_SP(src_a, src_a32, src_a33, src_a34, src_a35); - COPY_FLOAT_TO_VECTOR(*(a + 36), src_a36); + src_a36 = COPY_FLOAT_TO_VECTOR(*(a + 36)); res_c4 *= src_a36; res_c12 *= src_a36; @@ -220,9 +220,9 @@ static void ssolve_8x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c0 -= res_c2 * src_a16; res_c8 -= res_c10 * src_a16; - COPY_FLOAT_TO_VECTOR(*(a + 9), src_a9); - COPY_FLOAT_TO_VECTOR(*(a + 8), src_a8); - COPY_FLOAT_TO_VECTOR(*(a + 0), src_a0); + src_a9 = COPY_FLOAT_TO_VECTOR(*(a + 9)); + src_a8 = COPY_FLOAT_TO_VECTOR(*(a + 8)); + src_a0 = COPY_FLOAT_TO_VECTOR(*(a + 0)); res_c1 *= src_a9; res_c9 *= src_a9; @@ -306,7 +306,7 @@ static void ssolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO bb += 4; } - if (bk & 1) + if ((bk & 1) && (bk > 0)) { LD_SP2(aa, 4, src_a0, src_a1); @@ -374,7 +374,7 @@ static void ssolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a = LD_SP(a + 32); SPLATI_W4_SP(src_a, src_a32, src_a33, src_a34, src_a35); - COPY_FLOAT_TO_VECTOR(*(a + 36), src_a36); + src_a36 = COPY_FLOAT_TO_VECTOR(*(a + 36)); res_c4 *= src_a36; res_c3 -= res_c4 * src_a35; @@ -399,9 +399,9 @@ static void ssolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG 
ldc, BLASLO res_c1 -= res_c2 * src_a17; res_c0 -= res_c2 * src_a16; - COPY_FLOAT_TO_VECTOR(*(a + 9), src_a9); - COPY_FLOAT_TO_VECTOR(*(a + 8), src_a8); - COPY_FLOAT_TO_VECTOR(*(a + 0), src_a0); + src_a9 = COPY_FLOAT_TO_VECTOR(*(a + 9)); + src_a8 = COPY_FLOAT_TO_VECTOR(*(a + 8)); + src_a0 = COPY_FLOAT_TO_VECTOR(*(a + 0)); res_c1 *= src_a9; res_c0 -= res_c1 * src_a8; @@ -826,9 +826,9 @@ static void ssolve_4x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a9 = (v4f32) __msa_splati_w((v4i32) src_a8, 1); src_a8 = (v4f32) __msa_splati_w((v4i32) src_a8, 0); - COPY_FLOAT_TO_VECTOR(*(a + 5), src_a5); - COPY_FLOAT_TO_VECTOR(*(a + 4), src_a4); - COPY_FLOAT_TO_VECTOR(*(a + 0), src_a0); + src_a5 = COPY_FLOAT_TO_VECTOR(*(a + 5)); + src_a4 = COPY_FLOAT_TO_VECTOR(*(a + 4)); + src_a0 = COPY_FLOAT_TO_VECTOR(*(a + 0)); res_c3 *= src_a15; res_c7 *= src_a15; @@ -916,7 +916,7 @@ static void ssolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO bb += 4; } - if (bk & 1) + if ((bk & 1) && (bk > 0)) { src_a0 = LD_SP(aa); @@ -940,9 +940,9 @@ static void ssolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a10 = (v4f32) __msa_splati_w((v4i32) src_a8, 2); src_a9 = (v4f32) __msa_splati_w((v4i32) src_a8, 1); src_a8 = (v4f32) __msa_splati_w((v4i32) src_a8, 0); - COPY_FLOAT_TO_VECTOR(*(a + 5), src_a5); - COPY_FLOAT_TO_VECTOR(*(a + 4), src_a4); - COPY_FLOAT_TO_VECTOR(*(a + 0), src_a0); + src_a5 = COPY_FLOAT_TO_VECTOR(*(a + 5)); + src_a4 = COPY_FLOAT_TO_VECTOR(*(a + 4)); + src_a0 = COPY_FLOAT_TO_VECTOR(*(a + 0)); res_c3 *= src_a15; res_c2 -= res_c3 * src_a14; diff --git a/kernel/mips/strsm_kernel_LT_8x8_msa.c b/kernel/mips/strsm_kernel_LT_8x8_msa.c index c087fdae5..5834d77b2 100644 --- a/kernel/mips/strsm_kernel_LT_8x8_msa.c +++ b/kernel/mips/strsm_kernel_LT_8x8_msa.c @@ -162,7 +162,7 @@ static void ssolve_8x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a = LD_SP(a + 27); SPLATI_W4_SP(src_a, src_a27, src_a28, src_a29, src_a30); - 
COPY_FLOAT_TO_VECTOR(*(a + 31), src_a31); + src_a31 = COPY_FLOAT_TO_VECTOR(*(a + 31)); res_c3 *= src_a27; res_c11 *= src_a27; @@ -216,9 +216,9 @@ static void ssolve_8x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c7 -= res_c5 * src_a47; res_c15 -= res_c13 * src_a47; - COPY_FLOAT_TO_VECTOR(*(a + 54), src_a54); - COPY_FLOAT_TO_VECTOR(*(a + 55), src_a55); - COPY_FLOAT_TO_VECTOR(*(a + 63), src_a63); + src_a54 = COPY_FLOAT_TO_VECTOR(*(a + 54)); + src_a55 = COPY_FLOAT_TO_VECTOR(*(a + 55)); + src_a63 = COPY_FLOAT_TO_VECTOR(*(a + 63)); res_c6 *= src_a54; res_c14 *= src_a54; @@ -334,7 +334,7 @@ static void ssolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a = LD_SP(a + 27); SPLATI_W4_SP(src_a, src_a27, src_a28, src_a29, src_a30); - COPY_FLOAT_TO_VECTOR(*(a + 31), src_a31); + src_a31 = COPY_FLOAT_TO_VECTOR(*(a + 31)); res_c3 *= src_a27; res_c4 -= res_c3 * src_a28; @@ -359,9 +359,9 @@ static void ssolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c5 * src_a46; res_c7 -= res_c5 * src_a47; - COPY_FLOAT_TO_VECTOR(*(a + 54), src_a54); - COPY_FLOAT_TO_VECTOR(*(a + 55), src_a55); - COPY_FLOAT_TO_VECTOR(*(a + 63), src_a63); + src_a54 = COPY_FLOAT_TO_VECTOR(*(a + 54)); + src_a55 = COPY_FLOAT_TO_VECTOR(*(a + 55)); + src_a63 = COPY_FLOAT_TO_VECTOR(*(a + 63)); res_c6 *= src_a54; res_c7 -= res_c6 * src_a55; @@ -780,7 +780,7 @@ static void ssolve_4x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO b += 8; } - if (bk & 1) + if ((bk & 1) && (bk > 0)) { src_a0 = LD_SP(a); @@ -813,9 +813,9 @@ static void ssolve_4x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a7 = (v4f32) __msa_splati_w((v4i32) src_a5, 2); src_a6 = (v4f32) __msa_splati_w((v4i32) src_a5, 1); src_a5 = (v4f32) __msa_splati_w((v4i32) src_a5, 0); - COPY_FLOAT_TO_VECTOR(*(a + 10), src_a10); - COPY_FLOAT_TO_VECTOR(*(a + 11), src_a11); - COPY_FLOAT_TO_VECTOR(*(a + 15), src_a15); + src_a10 = COPY_FLOAT_TO_VECTOR(*(a + 10)); + src_a11 = 
COPY_FLOAT_TO_VECTOR(*(a + 11)); + src_a15 = COPY_FLOAT_TO_VECTOR(*(a + 15)); res_c0 *= src_a0; res_c4 *= src_a0; @@ -902,7 +902,7 @@ static void ssolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO b += 4; } - if (bk & 1) + if ((bk & 1) && (bk > 0)) { src_a0 = LD_SP(a); @@ -926,9 +926,9 @@ static void ssolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a7 = (v4f32) __msa_splati_w((v4i32) src_a5, 2); src_a6 = (v4f32) __msa_splati_w((v4i32) src_a5, 1); src_a5 = (v4f32) __msa_splati_w((v4i32) src_a5, 0); - COPY_FLOAT_TO_VECTOR(*(a + 10), src_a10); - COPY_FLOAT_TO_VECTOR(*(a + 11), src_a11); - COPY_FLOAT_TO_VECTOR(*(a + 15), src_a15); + src_a10 = COPY_FLOAT_TO_VECTOR(*(a + 10)); + src_a11 = COPY_FLOAT_TO_VECTOR(*(a + 11)); + src_a15 = COPY_FLOAT_TO_VECTOR(*(a + 15)); res_c0 *= src_a0; res_c1 -= res_c0 * src_a1; diff --git a/kernel/mips/strsm_kernel_RN_8x8_msa.c b/kernel/mips/strsm_kernel_RN_8x8_msa.c index 69d7b5f72..642ee3757 100644 --- a/kernel/mips/strsm_kernel_RN_8x8_msa.c +++ b/kernel/mips/strsm_kernel_RN_8x8_msa.c @@ -144,7 +144,7 @@ static void ssolve_8x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b = LD_SP(b + 27); SPLATI_W4_SP(src_b, src_b27, src_b28, src_b29, src_b30); - COPY_FLOAT_TO_VECTOR(*(b + 31), src_b31); + src_b31 = COPY_FLOAT_TO_VECTOR(*(b + 31)); src_c4 *= src_b18; src_c5 *= src_b18; @@ -184,9 +184,9 @@ static void ssolve_8x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b46 = (v4f32) __msa_splati_w((v4i32) src_b45, 1); src_b45 = (v4f32) __msa_splati_w((v4i32) src_b45, 0); - COPY_FLOAT_TO_VECTOR(*(b + 54), src_b54); - COPY_FLOAT_TO_VECTOR(*(b + 55), src_b55); - COPY_FLOAT_TO_VECTOR(*(b + 63), src_b63); + src_b54 = COPY_FLOAT_TO_VECTOR(*(b + 54)); + src_b55 = COPY_FLOAT_TO_VECTOR(*(b + 55)); + src_b63 = COPY_FLOAT_TO_VECTOR(*(b + 63)); src_c8 *= src_b36; src_c9 *= src_b36; @@ -275,7 +275,7 @@ static void ssolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO b += 
4; } - if (bk & 1) + if ((bk & 1) && (bk > 0)) { LD_SP2(a, 4, src_a0, src_a1); @@ -300,9 +300,9 @@ static void ssolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b7 = (v4f32) __msa_splati_w((v4i32) src_b5, 2); src_b6 = (v4f32) __msa_splati_w((v4i32) src_b5, 1); src_b5 = (v4f32) __msa_splati_w((v4i32) src_b5, 0); - COPY_FLOAT_TO_VECTOR(*(b + 10), src_b10); - COPY_FLOAT_TO_VECTOR(*(b + 11), src_b11); - COPY_FLOAT_TO_VECTOR(*(b + 15), src_b15); + src_b10 = COPY_FLOAT_TO_VECTOR(*(b + 10)); + src_b11 = COPY_FLOAT_TO_VECTOR(*(b + 11)); + src_b15 = COPY_FLOAT_TO_VECTOR(*(b + 15)); src_c0 *= src_b0; src_c1 *= src_b0; @@ -351,8 +351,8 @@ static void ssolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO { LD_SP2(a, 4, src_a0, src_a1); - COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); - COPY_FLOAT_TO_VECTOR(*(b + 1), src_b1); + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); + src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; @@ -364,8 +364,8 @@ static void ssolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO LD_SP2(a, 4, src_a0, src_a1); - COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); - COPY_FLOAT_TO_VECTOR(*(b + 1), src_b1); + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); + src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; @@ -376,12 +376,12 @@ static void ssolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO b += 2; } - if (bk & 1) + if ((bk & 1) && (bk > 0)) { LD_SP2(a, 4, src_a0, src_a1); - COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); - COPY_FLOAT_TO_VECTOR(*(b + 1), src_b1); + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); + src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; @@ -392,9 +392,9 @@ static void ssolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO b += 2; } - COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); - COPY_FLOAT_TO_VECTOR(*(b + 1), src_b1); - COPY_FLOAT_TO_VECTOR(*(b + 3), src_b3); 
+ src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); + src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1)); + src_b3 = COPY_FLOAT_TO_VECTOR(*(b + 3)); src_c0 *= src_b0; src_c1 *= src_b0; @@ -419,7 +419,7 @@ static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO { LD_SP2(a, 4, src_a0, src_a1); - COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; @@ -429,7 +429,7 @@ static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO LD_SP2(a, 4, src_a0, src_a1); - COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; @@ -439,7 +439,7 @@ static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO LD_SP2(a, 4, src_a0, src_a1); - COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; @@ -449,7 +449,7 @@ static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO LD_SP2(a, 4, src_a0, src_a1); - COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; @@ -458,13 +458,13 @@ static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO b += 1; } - if (bk & 3) + if ((bk & 3) && (bk > 0)) { if (bk & 2) { LD_SP2(a, 4, src_a0, src_a1); - COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; @@ -474,7 +474,7 @@ static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO LD_SP2(a, 4, src_a0, src_a1); - COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; @@ -487,7 +487,7 @@ static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO { LD_SP2(a, 4, src_a0, src_a1); - 
COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; @@ -497,7 +497,7 @@ static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO } } - COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_c0 *= src_b0; src_c1 *= src_b0; @@ -574,7 +574,7 @@ static void ssolve_4x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b = LD_SP(b + 27); SPLATI_W4_SP(src_b, src_b27, src_b28, src_b29, src_b30); - COPY_FLOAT_TO_VECTOR(*(b + 31), src_b31); + src_b31 = COPY_FLOAT_TO_VECTOR(*(b + 31)); src_b = LD_SP(b + 36); SPLATI_W4_SP(src_b, src_b36, src_b37, src_b38, src_b39); @@ -584,9 +584,9 @@ static void ssolve_4x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b46 = (v4f32) __msa_splati_w((v4i32) src_b45, 1); src_b45 = (v4f32) __msa_splati_w((v4i32) src_b45, 0); - COPY_FLOAT_TO_VECTOR(*(b + 54), src_b54); - COPY_FLOAT_TO_VECTOR(*(b + 55), src_b55); - COPY_FLOAT_TO_VECTOR(*(b + 63), src_b63); + src_b54 = COPY_FLOAT_TO_VECTOR(*(b + 54)); + src_b55 = COPY_FLOAT_TO_VECTOR(*(b + 55)); + src_b63 = COPY_FLOAT_TO_VECTOR(*(b + 63)); src_c0 *= src_b0; src_c1 -= src_c0 * src_b1; @@ -686,7 +686,7 @@ static void ssolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO b += 4; } - if (bk & 1) + if ((bk & 1) && (bk > 0)) { src_a0 = LD_SP(a); @@ -707,9 +707,9 @@ static void ssolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b7 = (v4f32) __msa_splati_w((v4i32) src_b5, 2); src_b6 = (v4f32) __msa_splati_w((v4i32) src_b5, 1); src_b5 = (v4f32) __msa_splati_w((v4i32) src_b5, 0); - COPY_FLOAT_TO_VECTOR(*(b + 10), src_b10); - COPY_FLOAT_TO_VECTOR(*(b + 11), src_b11); - COPY_FLOAT_TO_VECTOR(*(b + 15), src_b15); + src_b10 = COPY_FLOAT_TO_VECTOR(*(b + 10)); + src_b11 = COPY_FLOAT_TO_VECTOR(*(b + 11)); + src_b15 = COPY_FLOAT_TO_VECTOR(*(b + 15)); src_c0 *= src_b0; src_c1 -= src_c0 * src_b1; @@ -789,7 +789,7 @@ 
static void ssolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO b += 2; } - if (bk & 3) + if ((bk & 3) && (bk > 0)) { if (bk & 2) { @@ -831,9 +831,9 @@ static void ssolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO } } - COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); - COPY_FLOAT_TO_VECTOR(*(b + 1), src_b1); - COPY_FLOAT_TO_VECTOR(*(b + 3), src_b3); + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); + src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1)); + src_b3 = COPY_FLOAT_TO_VECTOR(*(b + 3)); src_c0 *= src_b0; src_c1 -= src_c0 * src_b1; diff --git a/kernel/mips/strsm_kernel_RT_8x8_msa.c b/kernel/mips/strsm_kernel_RT_8x8_msa.c index eefd3a665..21e41c8fb 100644 --- a/kernel/mips/strsm_kernel_RT_8x8_msa.c +++ b/kernel/mips/strsm_kernel_RT_8x8_msa.c @@ -158,7 +158,7 @@ static void ssolve_8x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b = LD_SP(b + 32); SPLATI_W4_SP(src_b, src_b32, src_b33, src_b34, src_b35); - COPY_FLOAT_TO_VECTOR(*(b + 36), src_b36); + src_b36 = COPY_FLOAT_TO_VECTOR(*(b + 36)); src_c8 *= src_b36; src_c9 *= src_b36; @@ -203,9 +203,9 @@ static void ssolve_8x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO ST_SP2(src_c4, src_c5, c_nxt2line, 4); ST_SP2(src_c6, src_c7, c_nxt3line, 4); - COPY_FLOAT_TO_VECTOR(*(b + 9), src_b9); - COPY_FLOAT_TO_VECTOR(*(b + 8), src_b8); - COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); + src_b9 = COPY_FLOAT_TO_VECTOR(*(b + 9)); + src_b8 = COPY_FLOAT_TO_VECTOR(*(b + 8)); + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_c2 *= src_b9; src_c3 *= src_b9; @@ -273,7 +273,7 @@ static void ssolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO bb += 4; } - if (bk & 1) + if ((bk & 1) && (bk > 0)) { LD_SP2(aa, 4, src_a0, src_a1); @@ -298,9 +298,9 @@ static void ssolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b10 = (v4f32) __msa_splati_w((v4i32) src_b8, 2); src_b9 = (v4f32) __msa_splati_w((v4i32) src_b8, 1); src_b8 = (v4f32) __msa_splati_w((v4i32) src_b8, 0); - 
COPY_FLOAT_TO_VECTOR(*(b + 5), src_b5); - COPY_FLOAT_TO_VECTOR(*(b + 4), src_b4); - COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); + src_b5 = COPY_FLOAT_TO_VECTOR(*(b + 5)); + src_b4 = COPY_FLOAT_TO_VECTOR(*(b + 4)); + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_c7 *= src_b15; src_c6 *= src_b15; @@ -350,8 +350,8 @@ static void ssolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO { LD_SP2(aa, 4, src_a0, src_a1); - COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); - COPY_FLOAT_TO_VECTOR(*(bb + 1), src_b1); + src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); + src_b1 = COPY_FLOAT_TO_VECTOR(*(bb + 1)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; @@ -363,8 +363,8 @@ static void ssolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO LD_SP2(aa, 4, src_a0, src_a1); - COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); - COPY_FLOAT_TO_VECTOR(*(bb + 1), src_b1); + src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); + src_b1 = COPY_FLOAT_TO_VECTOR(*(bb + 1)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; @@ -375,12 +375,12 @@ static void ssolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO bb += 2; } - if (bk & 1) + if ((bk & 1) && (bk > 0)) { LD_SP2(aa, 4, src_a0, src_a1); - COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); - COPY_FLOAT_TO_VECTOR(*(bb + 1), src_b1); + src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); + src_b1 = COPY_FLOAT_TO_VECTOR(*(bb + 1)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; @@ -391,9 +391,9 @@ static void ssolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO a -= 16; b -= 4; - COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); - COPY_FLOAT_TO_VECTOR(*(b + 2), src_b2); - COPY_FLOAT_TO_VECTOR(*(b + 3), src_b3); + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); + src_b2 = COPY_FLOAT_TO_VECTOR(*(b + 2)); + src_b3 = COPY_FLOAT_TO_VECTOR(*(b + 3)); src_c2 *= src_b3; src_c3 *= src_b3; @@ -419,7 +419,7 @@ static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { LD_SP2(aa, 4, src_a0, src_a1); - COPY_FLOAT_TO_VECTOR(*(bb + 0), 
src_b0); + src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; @@ -429,7 +429,7 @@ static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) LD_SP2(aa, 4, src_a0, src_a1); - COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); + src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; @@ -439,7 +439,7 @@ static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) LD_SP2(aa, 4, src_a0, src_a1); - COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); + src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; @@ -449,7 +449,7 @@ static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) LD_SP2(aa, 4, src_a0, src_a1); - COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); + src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; @@ -458,13 +458,13 @@ static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) bb += 1; } - if (bk & 3) + if ((bk & 3) && (bk > 0)) { if (bk & 2) { LD_SP2(aa, 4, src_a0, src_a1); - COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); + src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; @@ -474,7 +474,7 @@ static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) LD_SP2(aa, 4, src_a0, src_a1); - COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); + src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; @@ -487,7 +487,7 @@ static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { LD_SP2(aa, 4, src_a0, src_a1); - COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); + src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; @@ -497,7 +497,7 @@ static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) a -= 8; b -= 1; - COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_c0 *= src_b0; src_c1 *= src_b0; 
@@ -579,7 +579,7 @@ static void ssolve_4x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b = LD_SP(b + 32); SPLATI_W4_SP(src_b, src_b32, src_b33, src_b34, src_b35); - COPY_FLOAT_TO_VECTOR(*(b + 36), src_b36); + src_b36 = COPY_FLOAT_TO_VECTOR(*(b + 36)); src_b = LD_SP(b + 24); SPLATI_W4_SP(src_b, src_b24, src_b25, src_b26, src_b27); @@ -589,9 +589,9 @@ static void ssolve_4x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b17 = (v4f32) __msa_splati_w((v4i32) src_b16, 1); src_b16 = (v4f32) __msa_splati_w((v4i32) src_b16, 0); - COPY_FLOAT_TO_VECTOR(*(b + 9), src_b9); - COPY_FLOAT_TO_VECTOR(*(b + 8), src_b8); - COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); + src_b9 = COPY_FLOAT_TO_VECTOR(*(b + 9)); + src_b8 = COPY_FLOAT_TO_VECTOR(*(b + 8)); + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_c7 *= src_b63; src_c6 -= src_c7 * src_b62; @@ -695,7 +695,7 @@ static void ssolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO bb += 4; } - if (bk & 1) + if ((bk & 1) && (bk > 0)) { src_a = LD_SP(aa); @@ -717,9 +717,9 @@ static void ssolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b10 = (v4f32) __msa_splati_w((v4i32) src_b8, 2); src_b9 = (v4f32) __msa_splati_w((v4i32) src_b8, 1); src_b8 = (v4f32) __msa_splati_w((v4i32) src_b8, 0); - COPY_FLOAT_TO_VECTOR(*(b + 5), src_b5); - COPY_FLOAT_TO_VECTOR(*(b + 4), src_b4); - COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); + src_b5 = COPY_FLOAT_TO_VECTOR(*(b + 5)); + src_b4 = COPY_FLOAT_TO_VECTOR(*(b + 4)); + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_c3 *= src_b15; src_c2 -= src_c3 * src_b14; @@ -800,7 +800,7 @@ static void ssolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO bb += 2; } - if (bk & 3) + if ((bk & 3) && (bk > 0)) { if (bk & 2) { @@ -842,9 +842,9 @@ static void ssolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO a -= 8; b -= 4; - COPY_FLOAT_TO_VECTOR(*(b + 3), src_b3); - COPY_FLOAT_TO_VECTOR(*(b + 2), src_b2); - COPY_FLOAT_TO_VECTOR(*(b + 0), 
src_b0); + src_b3 = COPY_FLOAT_TO_VECTOR(*(b + 3)); + src_b2 = COPY_FLOAT_TO_VECTOR(*(b + 2)); + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_c1 *= src_b3; src_c0 -= src_c1 * src_b2; diff --git a/kernel/mips/zgemm_kernel_4x4_msa.c b/kernel/mips/zgemm_kernel_4x4_msa.c new file mode 100644 index 000000000..a185c69dd --- /dev/null +++ b/kernel/mips/zgemm_kernel_4x4_msa.c @@ -0,0 +1,1589 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +#define ZGEMM_KERNEL_4X4_MSA(OP0, OP1, OP2, OP3, OP4) \ +{ \ + LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); \ + LD_DP4_INC(pb0, 2, src_b0, src_b1, src_b2, src_b3); \ + \ + PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \ + PCKEVOD_D2_DP(src_a3, src_a2, src_a1r, src_a1i); \ + \ + /* 0th col */ \ + SPLATI_D2_DP(src_b0, src_br, src_bi); \ + res0_r OP0## = src_a0r * src_br; \ + res0_r OP1## = src_a0i * src_bi; \ + res0_i OP2## = OP4 src_a0r * src_bi; \ + res0_i OP3## = src_a0i * src_br; \ + \ + res1_r OP0## = src_a1r * src_br; \ + res1_r OP1## = src_a1i * src_bi; \ + res1_i OP2## = OP4 src_a1r * src_bi; \ + res1_i OP3## = src_a1i * src_br; \ + \ + /* 1st col */ \ + SPLATI_D2_DP(src_b1, src_br, src_bi); \ + res2_r OP0## = src_a0r * src_br; \ + res2_r OP1## = src_a0i * src_bi; \ + res2_i OP2## = OP4 src_a0r * src_bi; \ + res2_i OP3## = src_a0i * src_br; \ + \ + res3_r OP0## = src_a1r * src_br; \ + res3_r OP1## = src_a1i * src_bi; \ + res3_i OP2## = OP4 src_a1r * src_bi; \ + res3_i OP3## = src_a1i * src_br; \ + \ + /* 2nd col */ \ + SPLATI_D2_DP(src_b2, src_br, src_bi); \ + res4_r OP0## = src_a0r * src_br; \ + res4_r OP1## = src_a0i * src_bi; \ + res4_i OP2## = OP4 src_a0r * src_bi; \ + res4_i OP3## = src_a0i * src_br; \ + \ + res5_r OP0## = src_a1r * src_br; \ + res5_r OP1## = src_a1i * src_bi; \ + res5_i OP2## = OP4 src_a1r * src_bi; \ + res5_i OP3## = src_a1i * src_br; \ + \ + /* 3rd col */ \ + SPLATI_D2_DP(src_b3, src_br, src_bi); \ + res6_r OP0## = src_a0r * src_br; \ + res6_r OP1## = src_a0i * src_bi; \ + res6_i OP2## = OP4 src_a0r * src_bi; \ + res6_i OP3## = src_a0i * src_br; \ + \ + res7_r OP0## = src_a1r * src_br; \ + res7_r OP1## = src_a1i * src_bi; \ + res7_i OP2## = OP4 src_a1r * src_bi; \ + res7_i OP3## = src_a1i * src_br; \ +} + +#define ZGEMM_KERNEL_2X4_MSA(OP0, OP1, OP2, OP3, OP4) \ +{ \ + LD_DP2_INC(pa0, 2, src_a0, 
src_a1); \ + LD_DP4_INC(pb0, 2, src_b0, src_b1, src_b2, src_b3); \ + \ + PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \ + \ + /* 0th col */ \ + SPLATI_D2_DP(src_b0, src_br, src_bi); \ + res0_r OP0## = src_a0r * src_br; \ + res0_r OP1## = src_a0i * src_bi; \ + res0_i OP2## = OP4 src_a0r * src_bi; \ + res0_i OP3## = src_a0i * src_br; \ + \ + /* 1st col */ \ + SPLATI_D2_DP(src_b1, src_br, src_bi); \ + res2_r OP0## = src_a0r * src_br; \ + res2_r OP1## = src_a0i * src_bi; \ + res2_i OP2## = OP4 src_a0r * src_bi; \ + res2_i OP3## = src_a0i * src_br; \ + \ + /* 2nd col */ \ + SPLATI_D2_DP(src_b2, src_br, src_bi); \ + res4_r OP0## = src_a0r * src_br; \ + res4_r OP1## = src_a0i * src_bi; \ + res4_i OP2## = OP4 src_a0r * src_bi; \ + res4_i OP3## = src_a0i * src_br; \ + \ + /* 3rd col */ \ + SPLATI_D2_DP(src_b3, src_br, src_bi); \ + res6_r OP0## = src_a0r * src_br; \ + res6_r OP1## = src_a0i * src_bi; \ + res6_i OP2## = OP4 src_a0r * src_bi; \ + res6_i OP3## = src_a0i * src_br; \ +} + +#define ZGEMM_KERNEL_1X4_MSA(OP0, OP1, OP2, OP3, OP4) \ +{ \ + src_a0 = LD_DP(pa0); \ + LD_DP4_INC(pb0, 2, src_b0, src_b1, src_b2, src_b3); \ + \ + PCKEVOD_D2_DP(src_a0, src_a0, src_a0r, src_a0i); \ + \ + /* 0th and 1st col */ \ + PCKEVOD_D2_DP(src_b1, src_b0, src_br, src_bi); \ + res0_r OP0## = src_a0r * src_br; \ + res0_r OP1## = src_a0i * src_bi; \ + res0_i OP2## = OP4 src_a0r * src_bi; \ + res0_i OP3## = src_a0i * src_br; \ + \ + /* 2nd and 3rd col */ \ + PCKEVOD_D2_DP(src_b3, src_b2, src_br, src_bi); \ + res1_r OP0## = src_a0r * src_br; \ + res1_r OP1## = src_a0i * src_bi; \ + res1_i OP2## = OP4 src_a0r * src_bi; \ + res1_i OP3## = src_a0i * src_br; \ +} + +#define ZGEMM_KERNEL_4X2_MSA(OP0, OP1, OP2, OP3, OP4) \ +{ \ + LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); \ + LD_DP2_INC(pb0, 2, src_b0, src_b1); \ + \ + PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \ + PCKEVOD_D2_DP(src_a3, src_a2, src_a1r, src_a1i); \ + \ + /* 0th col */ \ + SPLATI_D2_DP(src_b0, src_br, src_bi); \ + 
res0_r OP0## = src_a0r * src_br; \ + res0_r OP1## = src_a0i * src_bi; \ + res0_i OP2## = OP4 src_a0r * src_bi; \ + res0_i OP3## = src_a0i * src_br; \ + \ + res1_r OP0## = src_a1r * src_br; \ + res1_r OP1## = src_a1i * src_bi; \ + res1_i OP2## = OP4 src_a1r * src_bi; \ + res1_i OP3## = src_a1i * src_br; \ + \ + /* 1st col */ \ + SPLATI_D2_DP(src_b1, src_br, src_bi); \ + res2_r OP0## = src_a0r * src_br; \ + res2_r OP1## = src_a0i * src_bi; \ + res2_i OP2## = OP4 src_a0r * src_bi; \ + res2_i OP3## = src_a0i * src_br; \ + \ + res3_r OP0## = src_a1r * src_br; \ + res3_r OP1## = src_a1i * src_bi; \ + res3_i OP2## = OP4 src_a1r * src_bi; \ + res3_i OP3## = src_a1i * src_br; \ +} + +#define ZGEMM_KERNEL_2X2_MSA(OP0, OP1, OP2, OP3, OP4) \ +{ \ + LD_DP2_INC(pa0, 2, src_a0, src_a1); \ + LD_DP2_INC(pb0, 2, src_b0, src_b1); \ + \ + PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \ + \ + /* 0th col */ \ + SPLATI_D2_DP(src_b0, src_br, src_bi); \ + res0_r OP0## = src_a0r * src_br; \ + res0_r OP1## = src_a0i * src_bi; \ + res0_i OP2## = OP4 src_a0r * src_bi; \ + res0_i OP3## = src_a0i * src_br; \ + \ + /* 1st col */ \ + SPLATI_D2_DP(src_b1, src_br, src_bi); \ + res2_r OP0## = src_a0r * src_br; \ + res2_r OP1## = src_a0i * src_bi; \ + res2_i OP2## = OP4 src_a0r * src_bi; \ + res2_i OP3## = src_a0i * src_br; \ +} + +#define ZGEMM_KERNEL_1X2_MSA(OP0, OP1, OP2, OP3, OP4) \ +{ \ + src_a0 = LD_DP(pa0); \ + LD_DP2_INC(pb0, 2, src_b0, src_b1); \ + \ + PCKEVOD_D2_DP(src_a0, src_a0, src_a0r, src_a0i); \ + \ + /* 0th and 1st col */ \ + PCKEVOD_D2_DP(src_b1, src_b0, src_br, src_bi); \ + res0_r OP0## = src_a0r * src_br; \ + res0_r OP1## = src_a0i * src_bi; \ + res0_i OP2## = OP4 src_a0r * src_bi; \ + res0_i OP3## = src_a0i * src_br; \ +} + +#define ZGEMM_KERNEL_4X1_MSA(OP0, OP1, OP2, OP3, OP4) \ +{ \ + LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); \ + src_b0 = LD_DP(pb0); \ + \ + PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \ + PCKEVOD_D2_DP(src_a3, src_a2, src_a1r, src_a1i); \ + \ + 
/* 0th col */ \ + SPLATI_D2_DP(src_b0, src_br, src_bi); \ + res0_r OP0## = src_a0r * src_br; \ + res0_r OP1## = src_a0i * src_bi; \ + res0_i OP2## = OP4 src_a0r * src_bi; \ + res0_i OP3## = src_a0i * src_br; \ + \ + res1_r OP0## = src_a1r * src_br; \ + res1_r OP1## = src_a1i * src_bi; \ + res1_i OP2## = OP4 src_a1r * src_bi; \ + res1_i OP3## = src_a1i * src_br; \ +} + +#define ZGEMM_KERNEL_2X1_MSA(OP0, OP1, OP2, OP3, OP4) \ +{ \ + LD_DP2_INC(pa0, 2, src_a0, src_a1); \ + src_b0 = LD_DP(pb0); \ + \ + PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \ + \ + /* 0th col */ \ + SPLATI_D2_DP(src_b0, src_br, src_bi); \ + res0_r OP0## = src_a0r * src_br; \ + res0_r OP1## = src_a0i * src_bi; \ + res0_i OP2## = OP4 src_a0r * src_bi; \ + res0_i OP3## = src_a0i * src_br; \ +} + +#define ZGEMM_KERNEL_1X1(OP0, OP1, OP2, OP3, OP4) \ +{ \ + /* 0th col */ \ + a0_r = pa0[0]; \ + a0_i = pa0[1]; \ + b0_r = pb0[0]; \ + b0_i = pb0[1]; \ + \ + res0 OP0## = a0_r * b0_r; \ + res0 OP1## = a0_i * b0_i; \ + res1 OP2## = OP4 a0_r * b0_i; \ + res1 OP3## = a0_i * b0_r; \ +} + +#define ZGEMM_SCALE_4X4_MSA \ +{ \ + LD_DP4(pc0, 2, dst0, dst1, dst2, dst3); \ + \ + PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \ + PCKEVOD_D2_DP(dst3, dst2, dst1_r, dst1_i); \ + \ + dst0_r += alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i += alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + dst1_r += alpha_r * res1_r; \ + dst1_r -= alpha_i * res1_i; \ + dst1_i += alpha_r * res1_i; \ + dst1_i += alpha_i * res1_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \ + \ + LD_DP4(pc1, 2, dst4, dst5, dst6, dst7); \ + \ + PCKEVOD_D2_DP(dst5, dst4, dst0_r, dst0_i); \ + PCKEVOD_D2_DP(dst7, dst6, dst1_r, dst1_i); \ + \ + dst0_r += alpha_r * res2_r; \ + dst0_r -= alpha_i * res2_i; \ + dst0_i += alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + dst1_r += alpha_r * res3_r; \ + dst1_r -= alpha_i * res3_i; \ + dst1_i += alpha_r * res3_i; \ + dst1_i += alpha_i * 
res3_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \ + ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \ + \ + ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); \ + ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2); \ + \ + LD_DP4(pc2, 2, dst0, dst1, dst2, dst3); \ + \ + PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \ + PCKEVOD_D2_DP(dst3, dst2, dst1_r, dst1_i); \ + \ + dst0_r += alpha_r * res4_r; \ + dst0_r -= alpha_i * res4_i; \ + dst0_i += alpha_r * res4_i; \ + dst0_i += alpha_i * res4_r; \ + \ + dst1_r += alpha_r * res5_r; \ + dst1_r -= alpha_i * res5_i; \ + dst1_i += alpha_r * res5_i; \ + dst1_i += alpha_i * res5_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \ + \ + LD_DP4(pc3, 2, dst4, dst5, dst6, dst7); \ + \ + PCKEVOD_D2_DP(dst5, dst4, dst0_r, dst0_i); \ + PCKEVOD_D2_DP(dst7, dst6, dst1_r, dst1_i); \ + \ + dst0_r += alpha_r * res6_r; \ + dst0_r -= alpha_i * res6_i; \ + dst0_i += alpha_r * res6_i; \ + dst0_i += alpha_i * res6_r; \ + \ + dst1_r += alpha_r * res7_r; \ + dst1_r -= alpha_i * res7_i; \ + dst1_i += alpha_r * res7_i; \ + dst1_i += alpha_i * res7_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \ + ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \ + \ + ST_DP4_INC(dst0, dst1, dst2, dst3, pc2, 2); \ + ST_DP4_INC(dst4, dst5, dst6, dst7, pc3, 2); \ +} + +#define ZGEMM_SCALE_2X4_MSA \ +{ \ + LD_DP2(pc0, 2, dst0, dst1); \ + \ + PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i += alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + \ + LD_DP2(pc1, 2, dst2, dst3); \ + \ + PCKEVOD_D2_DP(dst3, dst2, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res2_r; \ + dst0_r -= alpha_i * res2_i; \ + dst0_i += alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \ + \ + ST_DP2_INC(dst0, dst1, pc0, 2); \ + ST_DP2_INC(dst2, dst3, pc1, 2); \ + \ + LD_DP2(pc2, 2, dst0, dst1); \ + 
\ + PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res4_r; \ + dst0_r -= alpha_i * res4_i; \ + dst0_i += alpha_r * res4_i; \ + dst0_i += alpha_i * res4_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + \ + LD_DP2(pc3, 2, dst2, dst3); \ + \ + PCKEVOD_D2_DP(dst3, dst2, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res6_r; \ + dst0_r -= alpha_i * res6_i; \ + dst0_i += alpha_r * res6_i; \ + dst0_i += alpha_i * res6_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \ + \ + ST_DP2_INC(dst0, dst1, pc2, 2); \ + ST_DP2_INC(dst2, dst3, pc3, 2); \ +} + +#define ZGEMM_SCALE_1X4_MSA \ +{ \ + dst0 = LD_DP(pc0); \ + dst1 = LD_DP(pc1); \ + \ + PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i += alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + \ + dst2 = LD_DP(pc2); \ + dst3 = LD_DP(pc3); \ + \ + PCKEVOD_D2_DP(dst3, dst2, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res1_r; \ + dst0_r -= alpha_i * res1_i; \ + dst0_i += alpha_r * res1_i; \ + dst0_i += alpha_i * res1_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \ + \ + ST_DP(dst0, pc0); \ + ST_DP(dst1, pc1); \ + ST_DP(dst2, pc2); \ + ST_DP(dst3, pc3); \ +} + +#define ZGEMM_SCALE_4X2_MSA \ +{ \ + LD_DP4(pc0, 2, dst0, dst1, dst2, dst3); \ + \ + PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \ + PCKEVOD_D2_DP(dst3, dst2, dst1_r, dst1_i); \ + \ + dst0_r += alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i += alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + dst1_r += alpha_r * res1_r; \ + dst1_r -= alpha_i * res1_i; \ + dst1_i += alpha_r * res1_i; \ + dst1_i += alpha_i * res1_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \ + \ + LD_DP4(pc1, 2, dst4, dst5, dst6, dst7); \ + \ + PCKEVOD_D2_DP(dst5, dst4, dst0_r, dst0_i); \ + PCKEVOD_D2_DP(dst7, dst6, dst1_r, dst1_i); \ + \ + dst0_r += alpha_r * res2_r; \ + dst0_r 
-= alpha_i * res2_i; \ + dst0_i += alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + dst1_r += alpha_r * res3_r; \ + dst1_r -= alpha_i * res3_i; \ + dst1_i += alpha_r * res3_i; \ + dst1_i += alpha_i * res3_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \ + ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \ + \ + ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); \ + ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2); \ +} + +#define ZGEMM_SCALE_2X2_MSA \ +{ \ + LD_DP2(pc0, 2, dst0, dst1); \ + \ + PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i += alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_DP2_INC(dst0, dst1, pc0, 2); \ + \ + LD_DP2(pc1, 2, dst2, dst3); \ + \ + PCKEVOD_D2_DP(dst3, dst2, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res2_r; \ + dst0_r -= alpha_i * res2_i; \ + dst0_i += alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \ + \ + ST_DP2_INC(dst2, dst3, pc1, 2); \ +} + +#define ZGEMM_SCALE_1X2_MSA \ +{ \ + dst0 = LD_DP(pc0); \ + dst1 = LD_DP(pc1); \ + \ + PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i += alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_DP(dst0, pc0); \ + ST_DP(dst1, pc1); \ +} + +#define ZGEMM_SCALE_4X1_MSA \ +{ \ + LD_DP4(pc0, 2, dst0, dst1, dst2, dst3); \ + \ + PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \ + PCKEVOD_D2_DP(dst3, dst2, dst1_r, dst1_i); \ + \ + dst0_r += alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i += alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + dst1_r += alpha_r * res1_r; \ + dst1_r -= alpha_i * res1_i; \ + dst1_i += alpha_r * res1_i; \ + dst1_i += alpha_i * res1_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \ + \ + ST_DP4_INC(dst0, 
dst1, dst2, dst3, pc0, 2); \ +} + +#define ZGEMM_SCALE_2X1_MSA \ +{ \ + LD_DP2(pc0, 2, dst0, dst1); \ + \ + PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i += alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_DP2_INC(dst0, dst1, pc0, 2); \ +} + +#define ZGEMM_SCALE_1X1 \ +{ \ + pc0[0] += alphar * res0; \ + pc0[0] -= alphai * res1; \ + pc0[1] += alphar * res1; \ + pc0[1] += alphai * res0; \ +} + +#define ZGEMM_TRMM_SCALE_4X4_MSA \ +{ \ + dst0_r = alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i = alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + dst1_r = alpha_r * res1_r; \ + dst1_r -= alpha_i * res1_i; \ + dst1_i = alpha_r * res1_i; \ + dst1_i += alpha_i * res1_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \ + \ + dst0_r = alpha_r * res2_r; \ + dst0_r -= alpha_i * res2_i; \ + dst0_i = alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + dst1_r = alpha_r * res3_r; \ + dst1_r -= alpha_i * res3_i; \ + dst1_i = alpha_r * res3_i; \ + dst1_i += alpha_i * res3_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \ + ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \ + \ + ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); \ + ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2); \ + \ + dst0_r = alpha_r * res4_r; \ + dst0_r -= alpha_i * res4_i; \ + dst0_i = alpha_r * res4_i; \ + dst0_i += alpha_i * res4_r; \ + \ + dst1_r = alpha_r * res5_r; \ + dst1_r -= alpha_i * res5_i; \ + dst1_i = alpha_r * res5_i; \ + dst1_i += alpha_i * res5_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \ + \ + dst0_r = alpha_r * res6_r; \ + dst0_r -= alpha_i * res6_i; \ + dst0_i = alpha_r * res6_i; \ + dst0_i += alpha_i * res6_r; \ + \ + dst1_r = alpha_r * res7_r; \ + dst1_r -= alpha_i * res7_i; \ + dst1_i = alpha_r * res7_i; \ + dst1_i += alpha_i * res7_r; \ + \ + 
ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \ + ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \ + \ + ST_DP4_INC(dst0, dst1, dst2, dst3, pc2, 2); \ + ST_DP4_INC(dst4, dst5, dst6, dst7, pc3, 2); \ +} + +#define ZGEMM_TRMM_SCALE_2X4_MSA \ +{ \ + dst0_r = alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i = alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + \ + dst0_r = alpha_r * res2_r; \ + dst0_r -= alpha_i * res2_i; \ + dst0_i = alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \ + \ + ST_DP2_INC(dst0, dst1, pc0, 2); \ + ST_DP2_INC(dst2, dst3, pc1, 2); \ + \ + dst0_r = alpha_r * res4_r; \ + dst0_r -= alpha_i * res4_i; \ + dst0_i = alpha_r * res4_i; \ + dst0_i += alpha_i * res4_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + \ + dst0_r = alpha_r * res6_r; \ + dst0_r -= alpha_i * res6_i; \ + dst0_i = alpha_r * res6_i; \ + dst0_i += alpha_i * res6_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \ + \ + ST_DP2_INC(dst0, dst1, pc2, 2); \ + ST_DP2_INC(dst2, dst3, pc3, 2); \ +} + +#define ZGEMM_TRMM_SCALE_1X4_MSA \ +{ \ + dst0_r = alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i = alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + \ + dst0_r = alpha_r * res1_r; \ + dst0_r -= alpha_i * res1_i; \ + dst0_i = alpha_r * res1_i; \ + dst0_i += alpha_i * res1_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \ + \ + ST_DP(dst0, pc0); \ + ST_DP(dst1, pc1); \ + ST_DP(dst2, pc2); \ + ST_DP(dst3, pc3); \ +} + +#define ZGEMM_TRMM_SCALE_4X2_MSA \ +{ \ + dst0_r = alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i = alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + dst1_r = alpha_r * res1_r; \ + dst1_r -= alpha_i * res1_i; \ + dst1_i = alpha_r * res1_i; \ + dst1_i += alpha_i * res1_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \ + \ + dst0_r 
= alpha_r * res2_r; \ + dst0_r -= alpha_i * res2_i; \ + dst0_i = alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + dst1_r = alpha_r * res3_r; \ + dst1_r -= alpha_i * res3_i; \ + dst1_i = alpha_r * res3_i; \ + dst1_i += alpha_i * res3_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \ + ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \ + \ + ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); \ + ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2); \ +} + +#define ZGEMM_TRMM_SCALE_2X2_MSA \ +{ \ + dst0_r = alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i = alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_DP2_INC(dst0, dst1, pc0, 2); \ + \ + dst0_r = alpha_r * res2_r; \ + dst0_r -= alpha_i * res2_i; \ + dst0_i = alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \ + \ + ST_DP2_INC(dst2, dst3, pc1, 2); \ +} + +#define ZGEMM_TRMM_SCALE_1X2_MSA \ +{ \ + dst0_r = alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i = alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_DP(dst0, pc0); \ + ST_DP(dst1, pc1); \ +} + +#define ZGEMM_TRMM_SCALE_4X1_MSA \ +{ \ + dst0_r = alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i = alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + dst1_r = alpha_r * res1_r; \ + dst1_r -= alpha_i * res1_i; \ + dst1_i = alpha_r * res1_i; \ + dst1_i += alpha_i * res1_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \ + \ + ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); \ +} + +#define ZGEMM_TRMM_SCALE_2X1_MSA \ +{ \ + dst0_r = alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i = alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_DP2_INC(dst0, dst1, pc0, 2); \ +} + +#define ZGEMM_TRMM_SCALE_1X1 \ +{ \ + pc0[0] = alphar * res0; \ + pc0[0] -= alphai * res1; \ + 
pc0[1] = alphar * res1; \ + pc0[1] += alphai * res0; \ +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, + FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc +#ifdef TRMMKERNEL + , BLASLONG offset +#endif + ) +{ + BLASLONG i, j, l, temp; +#if defined(TRMMKERNEL) + BLASLONG off; +#endif + FLOAT *pc0, *pc1, *pc2, *pc3, *pa0, *pb0; + FLOAT res0, res1, a0_r, a0_i, b0_r, b0_i; + v2f64 src_a0, src_a1, src_a2, src_a3, src_b0, src_b1, src_b2, src_b3; + v2f64 src_a0r, src_a0i, src_a1r, src_a1i, src_br, src_bi; + v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v2f64 dst0_r, dst0_i, dst1_r, dst1_i, alpha_r, alpha_i; + v2f64 res0_r, res0_i, res1_r, res1_i, res2_r, res2_i, res3_r, res3_i; + v2f64 res4_r, res4_i, res5_r, res5_i, res6_r, res6_i, res7_r, res7_i; + + alpha_r = COPY_DOUBLE_TO_VECTOR(alphar); + alpha_i = COPY_DOUBLE_TO_VECTOR(alphai); + +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#endif + + for (j = (n >> 2); j--;) + { + pc0 = C; + pc1 = pc0 + 2 * ldc; + pc2 = pc1 + 2 * ldc; + pc3 = pc2 + 2 * ldc; + + pa0 = A; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + for (i = (m >> 2); i--;) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 4; + pb0 = B + off * 2 * 4; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 4; // number of values in A +#else + temp = off + 4; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_4X4_MSA(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_4X4_MSA(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_4X4_MSA(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + 
ZGEMM_KERNEL_4X4_MSA(, -, , -, -); +#endif + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_4X4_MSA(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_4X4_MSA(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_4X4_MSA(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_4X4_MSA(+, -, -, -,); +#endif + } + +#if defined(TRMMKERNEL) + ZGEMM_TRMM_SCALE_4X4_MSA +#else + ZGEMM_SCALE_4X4_MSA +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 2 * 4; + pb0 += temp * 2 * 4; +#endif + +#ifdef LEFT + off += 4; // number of values in A +#endif +#endif + } + + if (m & 2) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 2; + pb0 = B + off * 2 * 4; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 2; // number of values in A +#else + temp = off + 4; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_2X4_MSA(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_2X4_MSA(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_2X4_MSA(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_2X4_MSA(, -, , -, -); +#endif + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_2X4_MSA(+, -, +, +,); +#endif +#if 
defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_2X4_MSA(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_2X4_MSA(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_2X4_MSA(+, -, -, -,); +#endif + } + +#if defined(TRMMKERNEL) + ZGEMM_TRMM_SCALE_2X4_MSA +#else + ZGEMM_SCALE_2X4_MSA +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 2 * 2; + pb0 += temp * 2 * 4; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif +#endif + } + + if (m & 1) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 1; + pb0 = B + off * 2 * 4; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 1; // number of values in A +#else + temp = off + 4; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_1X4_MSA(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_1X4_MSA(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_1X4_MSA(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_1X4_MSA(, -, , -, -); +#endif + + pa0 += 2; + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_1X4_MSA(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_1X4_MSA(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + 
ZGEMM_KERNEL_1X4_MSA(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_1X4_MSA(+, -, -, -,); +#endif + + pa0 += 2; + } + +#if defined(TRMMKERNEL) + ZGEMM_TRMM_SCALE_1X4_MSA +#else + ZGEMM_SCALE_1X4_MSA +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 2 * 1; + pb0 += temp * 2 * 4; +#endif + +#ifdef LEFT + off += 1; // number of values in A +#endif +#endif + + pc0 += 2; + pc1 += 2; + pc2 += 2; + pc3 += 2; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 4; // number of values in A +#endif + + l = k << 3; + B = B + l; + i = ldc << 3; + C = C + i; + } + + if (n & 2) + { + pc0 = C; + pc1 = pc0 + 2 * ldc; + + pa0 = A; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + for (i = (m >> 2); i--;) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 4; + pb0 = B + off * 2 * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 4; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_4X2_MSA(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_4X2_MSA(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_4X2_MSA(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_4X2_MSA(, -, , -, -); +#endif + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_4X2_MSA(+, -, +, +,); +#endif +#if 
defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_4X2_MSA(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_4X2_MSA(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_4X2_MSA(+, -, -, -,); +#endif + } + +#if defined(TRMMKERNEL) + ZGEMM_TRMM_SCALE_4X2_MSA +#else + ZGEMM_SCALE_4X2_MSA +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 2 * 4; + pb0 += temp * 2 * 2; +#endif + +#ifdef LEFT + off += 4; // number of values in A +#endif +#endif + } + + if (m & 2) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 2; + pb0 = B + off * 2 * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 2; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_2X2_MSA(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_2X2_MSA(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_2X2_MSA(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_2X2_MSA(, -, , -, -); +#endif + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_2X2_MSA(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_2X2_MSA(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_2X2_MSA(+, +, +, 
-,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_2X2_MSA(+, -, -, -,); +#endif + } + +#if defined(TRMMKERNEL) + ZGEMM_TRMM_SCALE_2X2_MSA +#else + ZGEMM_SCALE_2X2_MSA +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 2 * 2; + pb0 += temp * 2 * 2; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif +#endif + } + + if (m & 1) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 1; + pb0 = B + off * 2 * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 1; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_1X2_MSA(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_1X2_MSA(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_1X2_MSA(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_1X2_MSA(, -, , -, -); +#endif + + pa0 += 2; + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_1X2_MSA(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_1X2_MSA(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_1X2_MSA(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_1X2_MSA(+, -, -, -,); +#endif + + pa0 += 2; + } + +#if defined(TRMMKERNEL) + 
ZGEMM_TRMM_SCALE_1X2_MSA +#else + ZGEMM_SCALE_1X2_MSA +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 2 * 1; + pb0 += temp * 2 * 2; +#endif + +#ifdef LEFT + off += 1; // number of values in A +#endif +#endif + + pc0 += 2; + pc1 += 2; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; // number of values in A +#endif + + l = k << 2; + B = B + l; + i = ldc << 2; + C = C + i; + } + + if (n & 1) + { + pc0 = C; + pa0 = A; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + for (i = (m >> 2); i--;) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 4; + pb0 = B + off * 2 * 1; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 4; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_4X1_MSA(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_4X1_MSA(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_4X1_MSA(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_4X1_MSA(, -, , -, -); +#endif + + pb0 += 2; + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_4X1_MSA(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_4X1_MSA(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_4X1_MSA(+, +, +, -,); +#endif +#if defined(RR) || 
defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_4X1_MSA(+, -, -, -,); +#endif + + pb0 += 2; + } + +#if defined(TRMMKERNEL) + ZGEMM_TRMM_SCALE_4X1_MSA +#else + ZGEMM_SCALE_4X1_MSA +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 1; // number of values in B +#endif + pa0 += temp * 2 * 4; + pb0 += temp * 2 * 1; +#endif + +#ifdef LEFT + off += 4; // number of values in A +#endif +#endif + } + + if (m & 2) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 2; + pb0 = B + off * 2 * 1; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 2; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_2X1_MSA(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_2X1_MSA(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_2X1_MSA(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_2X1_MSA(, -, , -, -); +#endif + + pb0 += 2; + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_2X1_MSA(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_2X1_MSA(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_2X1_MSA(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_2X1_MSA(+, -, -, -,); +#endif + + pb0 += 2; + } + +#if defined(TRMMKERNEL) + ZGEMM_TRMM_SCALE_2X1_MSA +#else + 
ZGEMM_SCALE_2X1_MSA +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 1; // number of values in B +#endif + pa0 += temp * 2 * 2; + pb0 += temp * 2 * 1; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif +#endif + } + + if (m & 1) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 1; + pb0 = B + off * 2 * 1; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 1; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_1X1(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_1X1(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_1X1(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_1X1(, -, , -, -); +#endif + + pa0 += 2; + pb0 += 2; + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_1X1(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_1X1(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_1X1(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_1X1(+, -, -, -,); +#endif + + pa0 += 2; + pb0 += 2; + } + +#if defined(TRMMKERNEL) + ZGEMM_TRMM_SCALE_1X1 +#else + ZGEMM_SCALE_1X1 +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 1; // 
number of values in A +#else + temp -= 1; // number of values in B +#endif + pa0 += temp * 2 * 1; + pb0 += temp * 2 * 1; +#endif + +#ifdef LEFT + off += 1; // number of values in A +#endif +#endif + + pc0 += 2; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; // number of values in A +#endif + + l = k << 1; + B = B + l; + i = ldc << 1; + C = C + i; + } + return 0; +} diff --git a/kernel/mips/zgemm_ncopy_4_msa.c b/kernel/mips/zgemm_ncopy_4_msa.c new file mode 100644 index 000000000..3ef46a571 --- /dev/null +++ b/kernel/mips/zgemm_ncopy_4_msa.c @@ -0,0 +1,144 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst) +{ + BLASLONG i, j; + FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *pdst; + v2f64 src0, src1, src2, src3, src4, src5, src6, src7; + v2f64 src8, src9, src10, src11, src12, src13, src14, src15; + + psrc0 = src; + pdst = dst; + lda *= 2; + + for (j = (n >> 2); j--;) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc3 = psrc2 + lda; + psrc4 = psrc3 + lda; + psrc0 += 4 * lda; + + for (i = (m >> 2); i--;) + { + LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); + LD_DP4_INC(psrc2, 2, src4, src5, src6, src7); + LD_DP4_INC(psrc3, 2, src8, src9, src10, src11); + LD_DP4_INC(psrc4, 2, src12, src13, src14, src15); + + ST_DP8_INC(src0, src4, src8, src12, src1, src5, src9, src13, pdst, 2); + ST_DP8_INC(src2, src6, src10, src14, src3, src7, src11, src15, + pdst, 2); + } + + if (m & 2) + { + LD_DP2_INC(psrc1, 2, src0, src1); + LD_DP2_INC(psrc2, 2, src4, src5); + LD_DP2_INC(psrc3, 2, src8, src9); + LD_DP2_INC(psrc4, 2, src12, src13); + + ST_DP8_INC(src0, src4, src8, src12, src1, src5, src9, src13, pdst, 2); + } + + if (m & 1) + { + src0 = LD_DP(psrc1); + src4 = LD_DP(psrc2); + src8 = LD_DP(psrc3); + src12 = LD_DP(psrc4); + psrc1 += 2; + psrc2 += 2; + psrc3 += 2; + psrc4 += 2; + + ST_DP4_INC(src0, src4, src8, src12, pdst, 2); + } + } + + if (n & 2) + { + psrc1 = psrc0; + 
psrc2 = psrc1 + lda; + psrc0 += 2 * lda; + + for (i = (m >> 2); i--;) + { + LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); + LD_DP4_INC(psrc2, 2, src4, src5, src6, src7); + + ST_DP8_INC(src0, src4, src1, src5, src2, src6, src3, src7, pdst, 2); + } + + if (m & 2) + { + LD_DP2_INC(psrc1, 2, src0, src1); + LD_DP2_INC(psrc2, 2, src4, src5); + + ST_DP4_INC(src0, src4, src1, src5, pdst, 2); + } + + if (m & 1) + { + src0 = LD_DP(psrc1); + src4 = LD_DP(psrc2); + psrc1 += 2; + psrc2 += 2; + + ST_DP2_INC(src0, src4, pdst, 2); + } + } + + if (n & 1) + { + psrc1 = psrc0; + + for (i = (m >> 2); i--;) + { + LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); + ST_DP4_INC(src0, src1, src2, src3, pdst, 2); + } + + if (m & 2) + { + LD_DP2_INC(psrc1, 2, src0, src1); + ST_DP2_INC(src0, src1, pdst, 2); + } + + if (m & 1) + { + src0 = LD_DP(psrc1); + ST_DP(src0, pdst); + } + } + + return 0; +} diff --git a/kernel/mips/zgemm_tcopy_4_msa.c b/kernel/mips/zgemm_tcopy_4_msa.c new file mode 100644 index 000000000..70314cb21 --- /dev/null +++ b/kernel/mips/zgemm_tcopy_4_msa.c @@ -0,0 +1,161 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst) +{ + BLASLONG i, j; + FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4; + FLOAT *pdst0, *pdst1, *pdst2, *pdst3; + v2f64 src0, src1, src2, src3, src4, src5, src6, src7; + v2f64 src8, src9, src10, src11, src12, src13, src14, src15; + + psrc0 = src; + pdst0 = dst; + lda *= 2; + + pdst2 = dst + 2 * m * (n & ~3); + pdst3 = dst + 2 * m * (n & ~1); + + for (j = (m >> 2); j--;) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc3 = psrc2 + lda; + psrc4 = psrc3 + lda; + psrc0 += 4 * lda; + + pdst1 = pdst0; + pdst0 += 32; + + for (i = (n >> 2); i--;) + { + LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); + LD_DP4_INC(psrc2, 2, src4, src5, src6, src7); + LD_DP4_INC(psrc3, 2, src8, src9, src10, src11); + LD_DP4_INC(psrc4, 2, src12, src13, src14, src15); + + ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2); + ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15, + pdst1 + 16, 2); + pdst1 += m * 8; + } + + if (n & 2) + { + LD_DP2_INC(psrc1, 2, src0, src1); + LD_DP2_INC(psrc2, 2, src2, src3); + LD_DP2_INC(psrc3, 
2, src4, src5); + LD_DP2_INC(psrc4, 2, src6, src7); + + ST_DP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 2); + } + + if (n & 1) + { + src0 = LD_DP(psrc1); + src1 = LD_DP(psrc2); + src2 = LD_DP(psrc3); + src3 = LD_DP(psrc4); + psrc1 += 2; + psrc2 += 2; + psrc3 += 2; + psrc4 += 2; + + ST_DP4_INC(src0, src1, src2, src3, pdst3, 2); + } + } + + if (m & 2) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc0 += 2 * lda; + + pdst1 = pdst0; + pdst0 += 16; + + for (i = (n >> 2); i--;) + { + LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); + LD_DP4_INC(psrc2, 2, src4, src5, src6, src7); + + ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2); + + pdst1 += m * 8; + } + + if (n & 2) + { + LD_DP2_INC(psrc1, 2, src0, src1); + LD_DP2_INC(psrc2, 2, src2, src3); + + ST_DP4_INC(src0, src1, src2, src3, pdst2, 2); + } + + if (n & 1) + { + src0 = LD_DP(psrc1); + src1 = LD_DP(psrc2); + + ST_DP2_INC(src0, src1, pdst3, 2); + + psrc1 += 2; + psrc2 += 2; + } + } + + if (m & 1) + { + psrc1 = psrc0; + pdst1 = pdst0; + + for (i = (n >> 2); i--;) + { + LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); + ST_DP4(src0, src1, src2, src3, pdst1, 2); + + pdst1 += m * 8; + } + + if (n & 2) + { + LD_DP2_INC(psrc1, 2, src0, src1); + ST_DP2_INC(src0, src1, pdst2, 2); + } + + if (n & 1) + { + src0 = LD_DP(psrc1); + ST_DP(src0, pdst3); + } + } + + return 0; +} diff --git a/param.h b/param.h index fdc9d1104..dd58744d4 100644 --- a/param.h +++ b/param.h @@ -2188,11 +2188,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define DGEMM_DEFAULT_UNROLL_M 8 #define DGEMM_DEFAULT_UNROLL_N 4 -#define CGEMM_DEFAULT_UNROLL_M 2 -#define CGEMM_DEFAULT_UNROLL_N 2 - -#define ZGEMM_DEFAULT_UNROLL_M 2 -#define ZGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 #define SGEMM_DEFAULT_P 128 #define DGEMM_DEFAULT_P 128 @@ -2227,11 +2227,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define DGEMM_DEFAULT_UNROLL_M 8 #define DGEMM_DEFAULT_UNROLL_N 4 -#define CGEMM_DEFAULT_UNROLL_M 2 -#define CGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 -#define ZGEMM_DEFAULT_UNROLL_M 2 -#define ZGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 #define SGEMM_DEFAULT_P 128 #define DGEMM_DEFAULT_P 128