From c6996a80e950535306e281f7f2da54f387805d4e Mon Sep 17 00:00:00 2001 From: Shiyou Yin Date: Fri, 8 Dec 2023 16:06:17 +0800 Subject: [PATCH 01/21] loongarch64: Refine amax,amin,max,min optimization. --- common_loongarch64.h | 24 +++ kernel/loongarch64/KERNEL.LOONGSON2K1000 | 16 +- kernel/loongarch64/KERNEL.LOONGSON3R5 | 16 +- kernel/loongarch64/amax_lasx.S | 232 +++++++++++++++++++++++ kernel/loongarch64/amax_lsx.S | 231 ++++++++++++++++++++++ kernel/loongarch64/amin_lasx.S | 232 +++++++++++++++++++++++ kernel/loongarch64/amin_lsx.S | 232 +++++++++++++++++++++++ kernel/loongarch64/damax_lasx.S | 183 ------------------ kernel/loongarch64/damax_lsx.S | 145 -------------- kernel/loongarch64/damin_lasx.S | 178 ----------------- kernel/loongarch64/damin_lsx.S | 145 -------------- kernel/loongarch64/dmax_lasx.S | 175 ----------------- kernel/loongarch64/dmax_lsx.S | 141 -------------- kernel/loongarch64/dmin_lasx.S | 175 ----------------- kernel/loongarch64/dmin_lsx.S | 143 -------------- kernel/loongarch64/max_lasx.S | 229 ++++++++++++++++++++++ kernel/loongarch64/max_lsx.S | 228 ++++++++++++++++++++++ kernel/loongarch64/min_lasx.S | 229 ++++++++++++++++++++++ kernel/loongarch64/min_lsx.S | 228 ++++++++++++++++++++++ kernel/loongarch64/samax_lasx.S | 208 -------------------- kernel/loongarch64/samax_lsx.S | 177 ----------------- kernel/loongarch64/samin_lasx.S | 208 -------------------- kernel/loongarch64/samin_lsx.S | 177 ----------------- kernel/loongarch64/smax_lasx.S | 205 -------------------- kernel/loongarch64/smax_lsx.S | 171 ----------------- kernel/loongarch64/smin_lasx.S | 205 -------------------- kernel/loongarch64/smin_lsx.S | 174 ----------------- 27 files changed, 1881 insertions(+), 2826 deletions(-) create mode 100644 kernel/loongarch64/amax_lasx.S create mode 100644 kernel/loongarch64/amax_lsx.S create mode 100644 kernel/loongarch64/amin_lasx.S create mode 100644 kernel/loongarch64/amin_lsx.S delete mode 100644 kernel/loongarch64/damax_lasx.S delete mode 100644 kernel/loongarch64/damax_lsx.S delete mode 100644 kernel/loongarch64/damin_lasx.S delete mode 100644 kernel/loongarch64/damin_lsx.S delete mode 100644 kernel/loongarch64/dmax_lasx.S delete mode 100644 kernel/loongarch64/dmax_lsx.S delete mode 100644 kernel/loongarch64/dmin_lasx.S delete mode 100644 kernel/loongarch64/dmin_lsx.S create mode 100644 kernel/loongarch64/max_lasx.S create mode 100644 kernel/loongarch64/max_lsx.S create mode 100644 kernel/loongarch64/min_lasx.S create mode 100644 kernel/loongarch64/min_lsx.S delete mode 100644 kernel/loongarch64/samax_lasx.S delete mode 100644 kernel/loongarch64/samax_lsx.S delete mode 100644 kernel/loongarch64/samin_lasx.S delete mode 100644 kernel/loongarch64/samin_lsx.S delete mode 100644 kernel/loongarch64/smax_lasx.S delete mode 100644 kernel/loongarch64/smax_lsx.S delete mode 100644 kernel/loongarch64/smin_lasx.S delete mode 100644 kernel/loongarch64/smin_lsx.S diff --git a/common_loongarch64.h b/common_loongarch64.h index 4963b2f07..72e900f77 100644 --- a/common_loongarch64.h +++ b/common_loongarch64.h @@ -120,6 +120,10 @@ static inline int WhereAmI(void){ #define CMOVT fsel #define MTC movgr2fr.d #define FABS fabs.d +#define FMIN fmin.d +#define FMINA fmina.d +#define FMAX fmax.d +#define FMAXA fmaxa.d #define CMPEQ fcmp.ceq.d #define CMPLE fcmp.cle.d #define CMPLT fcmp.clt.d @@ -128,10 +132,18 @@ static inline int WhereAmI(void){ #define XVFSUB xvfsub.d #define XVFADD xvfadd.d #define XVFMADD xvfmadd.d +#define XVFMIN xvfmin.d +#define XVFMINA xvfmina.d +#define 
XVFMAX xvfmax.d +#define XVFMAXA xvfmaxa.d #define VFSUB vfsub.d #define VFADD vfadd.d #define VFMADD vfmadd.d +#define VFMIN vfmin.d +#define VFMINA vfmina.d +#define VFMAX vfmax.d +#define VFMAXA vfmaxa.d #else @@ -148,6 +160,10 @@ static inline int WhereAmI(void){ #define CMOVT fsel #define MTC movgr2fr.w #define FABS fabs.s +#define FMIN fmin.s +#define FMINA fmina.s +#define FMAX fmax.s +#define FMAXA fmaxa.s #define CMPEQ fcmp.ceq.s #define CMPLE fcmp.cle.s #define CMPLT fcmp.clt.s @@ -156,10 +172,18 @@ static inline int WhereAmI(void){ #define XVFSUB xvfsub.s #define XVFADD xvfadd.s #define XVFMADD xvfmadd.s +#define XVFMIN xvfmin.s +#define XVFMINA xvfmina.s +#define XVFMAX xvfmax.s +#define XVFMAXA xvfmaxa.s #define VFSUB vfsub.s #define VFADD vfadd.s #define VFMADD vfmadd.s +#define VFMIN vfmin.s +#define VFMINA vfmina.s +#define VFMAX vfmax.s +#define VFMAXA vfmaxa.s #endif /* defined(DOUBLE) */ diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 1e4fa7a9d..802dd1c9b 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -7,17 +7,17 @@ DDOTKERNEL = dot_lsx.S SSCALKERNEL = sscal_lsx.S DSCALKERNEL = dscal_lsx.S -SAMAXKERNEL = samax_lsx.S -DAMAXKERNEL = damax_lsx.S +SAMAXKERNEL = amax_lsx.S +DAMAXKERNEL = amax_lsx.S -SAMINKERNEL = samin_lsx.S -DAMINKERNEL = damin_lsx.S +SAMINKERNEL = amin_lsx.S +DAMINKERNEL = amin_lsx.S -SMAXKERNEL = smax_lsx.S -DMAXKERNEL = dmax_lsx.S +SMAXKERNEL = max_lsx.S +DMAXKERNEL = max_lsx.S -SMINKERNEL = smin_lsx.S -DMINKERNEL = dmin_lsx.S +SMINKERNEL = min_lsx.S +DMINKERNEL = min_lsx.S ISMAXKERNEL = ismax_lsx.S IDMAXKERNEL = idmax_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index f00abcb32..3253489d9 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -7,17 +7,17 @@ DDOTKERNEL = dot_lasx.S SSCALKERNEL = sscal_lasx.S DSCALKERNEL = dscal_lasx.S -SAMAXKERNEL = samax_lasx.S -DAMAXKERNEL = damax_lasx.S +SAMAXKERNEL = amax_lasx.S +DAMAXKERNEL = amax_lasx.S -SAMINKERNEL = samin_lasx.S -DAMINKERNEL = damin_lasx.S +SAMINKERNEL = amin_lasx.S +DAMINKERNEL = amin_lasx.S -SMAXKERNEL = smax_lasx.S -DMAXKERNEL = dmax_lasx.S +SMAXKERNEL = max_lsx.S +DMAXKERNEL = max_lsx.S -SMINKERNEL = smin_lasx.S -DMINKERNEL = dmin_lasx.S +SMINKERNEL = min_lsx.S +DMINKERNEL = min_lsx.S ISMAXKERNEL = ismax_lasx.S IDMAXKERNEL = idmax_lasx.S diff --git a/kernel/loongarch64/amax_lasx.S b/kernel/loongarch64/amax_lasx.S new file mode 100644 index 000000000..e964d4ddb --- /dev/null +++ b/kernel/loongarch64/amax_lasx.S @@ -0,0 +1,232 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r12 +#define TEMP $r13 + +#define VM0 $xr0 +#define VM1 $xr1 +#define VM2 $xr2 +#define VX0 $xr3 +#define VX1 $xr4 +#define VX2 $xr5 +#define VX3 $xr6 + +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r17 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT +#ifdef DOUBLE + xvldrepl.d VM0, X, 0 +#else + xvldrepl.w VM0, X, 0 +#endif + XVFSUB VM0, VM0, VM0 + bne INCX, TEMP, .L20 + + srai.d I, N, 4 + bge $r0, I, .L11 + .align 3 + +.L10: +#ifdef DOUBLE + xvld VX0, X, 0 + xvld VX1, X, 32 + xvld VX2, X, 64 + xvld VX3, X, 96 + addi.d I, I, -1 + addi.d X, X, 128 + XVFMAXA VM1, VX0, VX1 + XVFMAXA VM2, VX2, VX3 + XVFMAXA VM0, VM0, VM1 + XVFMAXA VM0, VM0, VM2 +#else + xvld VX0, X, 0 + xvld VX1, X, 32 + addi.d I, I, -1 + addi.d X, X, 64 + XVFMAXA VM1, VX0, VX1 + XVFMAXA VM0, VM0, VM1 +#endif + blt $r0, I, .L10 + +#ifdef DOUBLE + xvrepl128vei.d VX0, VM0, 0 + xvrepl128vei.d VX1, VM0, 1 + XVFMAXA VM0, VX0, VX1 +#else + xvrepl128vei.w VX0, VM0, 0 + xvrepl128vei.w VX1, VM0, 1 + xvrepl128vei.w VX2, VM0, 2 + xvrepl128vei.w VX3, VM0, 3 + XVFMAXA VM1, VX0, VX1 + XVFMAXA VM2, VX2, VX3 + XVFMAXA VM0, VM1, VM2 +#endif + xvpermi.q VM1, VM0, 0x1 + XVFMAXA VM0, VM0, VM1 + .align 3 + +.L11: + andi I, N, 0x0f + bge $r0, I, .L13 + .align 3 + +.L12: /* 0 < N < 16 */ + LD $f1, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + FMAXA $f0, $f0, $f1 + bnez I, .L12 + .align 3 + +.L13: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + srai.d I, N, 3 + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmaxa.d VM1, VX0, VX1 + xvfmaxa.d VM0, VM0, VM1 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VM1, t1, 0 + xvinsgr2vr.w VM1, t2, 1 + xvinsgr2vr.w VM1, t3, 2 + xvinsgr2vr.w VM1, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + 
add.d X, X, INCX + xvinsgr2vr.w VM1, t1, 4 + xvinsgr2vr.w VM1, t2, 5 + xvinsgr2vr.w VM1, t3, 6 + xvinsgr2vr.w VM1, t4, 7 + xvfmaxa.s VM0, VM0, VM1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvrepl128vei.d VX0, VM0, 0 + xvrepl128vei.d VX1, VM0, 1 + XVFMAXA VM0, VX0, VX1 +#else + xvrepl128vei.w VX0, VM0, 0 + xvrepl128vei.w VX1, VM0, 1 + xvrepl128vei.w VX2, VM0, 2 + xvrepl128vei.w VX3, VM0, 3 + XVFMAXA VM1, VX0, VX1 + XVFMAXA VM2, VX2, VX3 + XVFMAXA VM0, VM1, VM2 +#endif + xvpermi.q VM1, VM0, 1 + XVFMAXA VM0, VM0, VM1 + .align 3 + +.L23: //INCX!=1 and N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: /* 0 < N < 8 */ + LD $f1, X, 0 + addi.d I, I, -1 + add.d X, X, INCX + FMAXA $f0, $f0, $f1 + bnez I, .L24 + .align 3 + +.L999: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/amax_lsx.S b/kernel/loongarch64/amax_lsx.S new file mode 100644 index 000000000..fb3b77a0e --- /dev/null +++ b/kernel/loongarch64/amax_lsx.S @@ -0,0 +1,231 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r12 +#define TEMP $r13 + +#define VM0 $vr0 +#define VM1 $vr1 +#define VM2 $vr2 +#define VX0 $vr3 +#define VX1 $vr4 +#define VX2 $vr5 +#define VX3 $vr6 + +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r17 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT +#ifdef DOUBLE + vldrepl.d VM0, X, 0 +#else + vldrepl.w VM0, X, 0 +#endif + VFSUB VM0, VM0, VM0 + bne INCX, TEMP, .L20 + + srai.d I, N, 3 + bge $r0, I, .L11 + .align 3 + +.L10: +#ifdef DOUBLE + vld VX0, X, 0 + vld VX1, X, 16 + vld VX2, X, 32 + vld VX3, X, 48 + addi.d I, I, -1 + addi.d X, X, 64 + VFMAXA VM1, VX0, VX1 + VFMAXA VM2, VX2, VX3 + VFMAXA VM0, VM0, VM1 + VFMAXA VM0, VM0, VM2 +#else + vld VX0, X, 0 + vld VX1, X, 16 + addi.d I, I, -1 + addi.d X, X, 32 + VFMAXA VM1, VX0, VX1 + VFMAXA VM0, VM0, VM1 +#endif + blt $r0, I, .L10 + +#ifdef DOUBLE + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + VFMAXA VM0, VX0, VX1 +#else + vreplvei.w VX0, VM0, 0 + vreplvei.w VX1, VM0, 1 + vreplvei.w VX2, VM0, 2 + vreplvei.w VX3, VM0, 3 + VFMAXA VM1, VX0, VX1 + VFMAXA VM2, VX2, VX3 + VFMAXA VM0, VM1, VM2 +#endif + .align 3 + +.L11: + andi I, N, 7 + bge $r0, I, .L13 + .align 3 + +.L12: + LD $f1, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + FMAXA $f0, $f0, $f1 + bnez I, .L12 + .align 3 + +.L13: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + srai.d I, N, 3 + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM1, VX0, VX1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM2, VX0, VX1 + vfmaxa.d VM1, VM1, VM2 + vfmaxa.d VM0, VM0, VM1 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfmaxa.s VM1, VX0, VX1 + vfmaxa.s VM0, VM0, VM1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + VFMAXA VM0, VX0, VX1 +#else + vreplvei.w VX0, VM0, 0 + vreplvei.w VX1, VM0, 1 + vreplvei.w VX2, VM0, 2 + vreplvei.w VX3, VM0, 3 + VFMAXA VM1, VX0, VX1 + VFMAXA VM2, VX2, VX3 + VFMAXA VM0, VM1, VM2 +#endif + .align 3 + +.L23: //INCX!=1 and N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f1, X, 0 + addi.d I, I, -1 + add.d X, X, INCX + FMAXA $f0, $f0, $f1 + bnez I, .L24 + .align 3 + +.L999: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/amin_lasx.S 
b/kernel/loongarch64/amin_lasx.S new file mode 100644 index 000000000..0a4359002 --- /dev/null +++ b/kernel/loongarch64/amin_lasx.S @@ -0,0 +1,232 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r12 +#define TEMP $r13 + +#define VM0 $xr0 +#define VM1 $xr1 +#define VM2 $xr2 +#define VX0 $xr3 +#define VX1 $xr4 +#define VX2 $xr5 +#define VX3 $xr6 + +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r17 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT +#ifdef DOUBLE + xvldrepl.d VM0, X, 0 +#else + xvldrepl.w VM0, X, 0 +#endif + XVFSUB VM0, VM0, VM0 + bne INCX, TEMP, .L20 + + srai.d I, N, 4 + bge $r0, I, .L11 + .align 3 + +.L10: +#ifdef DOUBLE + xvld VX0, X, 0 + xvld VX1, X, 32 + xvld VX2, X, 64 + xvld VX3, X, 96 + addi.d I, I, -1 + addi.d X, X, 128 + XVFMINA VM1, VX0, VX1 + XVFMINA VM2, VX2, VX3 + XVFMINA VM0, VM0, VM1 + XVFMINA VM0, VM0, VM2 +#else + xvld VX0, X, 0 + xvld VX1, X, 32 + addi.d I, I, -1 + addi.d X, X, 64 + XVFMINA VM1, VX0, VX1 + XVFMINA VM0, VM0, VM1 +#endif + blt $r0, I, .L10 + +#ifdef DOUBLE + xvrepl128vei.d VX0, VM0, 0 + xvrepl128vei.d VX1, VM0, 1 + XVFMINA VM0, VX0, VX1 +#else + xvrepl128vei.w VX0, VM0, 0 + xvrepl128vei.w VX1, VM0, 1 + xvrepl128vei.w VX2, VM0, 2 + xvrepl128vei.w VX3, VM0, 3 + XVFMINA VM1, VX0, VX1 + XVFMINA VM2, VX2, VX3 + XVFMINA VM0, VM1, VM2 +#endif + xvpermi.q VM1, VM0, 0x1 + XVFMINA VM0, VM0, VM1 + .align 3 + +.L11: + andi I, N, 0x0f + bge $r0, I, .L13 + .align 3 + +.L12: /* 0 < N < 16 */ + LD $f1, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + FMINA $f0, $f0, $f1 + bnez I, .L12 + .align 3 + +.L13: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + srai.d I, N, 3 + bge 
$r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmaxa.d VM1, VX0, VX1 + xvfmaxa.d VM0, VM0, VM1 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VM1, t1, 0 + xvinsgr2vr.w VM1, t2, 1 + xvinsgr2vr.w VM1, t3, 2 + xvinsgr2vr.w VM1, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VM1, t1, 4 + xvinsgr2vr.w VM1, t2, 5 + xvinsgr2vr.w VM1, t3, 6 + xvinsgr2vr.w VM1, t4, 7 + xvfmaxa.s VM0, VM0, VM1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvrepl128vei.d VX0, VM0, 0 + xvrepl128vei.d VX1, VM0, 1 + XVFMINA VM0, VX0, VX1 +#else + xvrepl128vei.w VX0, VM0, 0 + xvrepl128vei.w VX1, VM0, 1 + xvrepl128vei.w VX2, VM0, 2 + xvrepl128vei.w VX3, VM0, 3 + XVFMINA VM1, VX0, VX1 + XVFMINA VM2, VX2, VX3 + XVFMINA VM0, VM1, VM2 +#endif + xvpermi.q VM1, VM0, 1 + XVFMINA VM0, VM0, VM1 + .align 3 + +.L23: //INCX!=1 and N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: /* 0 < N < 8 */ + LD $f1, X, 0 + addi.d I, I, -1 + add.d X, X, INCX + FMINA $f0, $f0, $f1 + bnez I, .L24 + .align 3 + +.L999: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/amin_lsx.S b/kernel/loongarch64/amin_lsx.S new file mode 100644 index 000000000..644caf43c --- /dev/null +++ b/kernel/loongarch64/amin_lsx.S @@ -0,0 +1,232 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r12 +#define TEMP $r13 + +#define VM0 $vr0 +#define VM1 $vr1 +#define VM2 $vr2 +#define VX0 $vr3 +#define VX1 $vr4 +#define VX2 $vr5 +#define VX3 $vr6 + +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r17 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT +#ifdef DOUBLE + vldrepl.d VM0, X, 0 +#else + vldrepl.w VM0, X, 0 +#endif + VFSUB VM0, VM0, VM0 + bne INCX, TEMP, .L20 + + srai.d I, N, 3 + bge $r0, I, .L11 + .align 3 + +.L10: +#ifdef DOUBLE + vld VX0, X, 0 + vld VX1, X, 16 + vld VX2, X, 32 + vld VX3, X, 48 + addi.d I, I, -1 + addi.d X, X, 64 + VFMINA VM1, VX0, VX1 + VFMINA VM2, VX2, VX3 + VFMINA VM0, VM0, VM1 + VFMINA VM0, VM0, VM2 +#else + vld VX0, X, 0 + vld VX1, X, 16 + addi.d I, I, -1 + addi.d X, X, 32 + VFMINA VM1, VX0, VX1 + VFMINA VM0, VM0, VM1 +#endif + blt $r0, I, .L10 + +#ifdef DOUBLE + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + VFMINA VM0, VX0, VX1 +#else + vreplvei.w VX0, VM0, 0 + vreplvei.w VX1, VM0, 1 + vreplvei.w VX2, VM0, 2 + vreplvei.w VX3, VM0, 3 + VFMINA VM1, VX0, VX1 + VFMINA VM2, VX2, VX3 + VFMINA VM0, VM1, VM2 +#endif + .align 3 + +.L11: + andi I, N, 7 + bge $r0, I, .L13 + .align 3 + +.L12: + LD $f1, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + FMINA $f0, $f0, $f1 + bnez I, .L12 + .align 3 + +.L13: + FABS $f0, $f0 + SUB $f0, $f0, $f0 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + srai.d I, N, 3 + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM1, VX0, VX1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM2, VX0, VX1 + vfmaxa.d VM1, VM1, VM2 + vfmaxa.d VM0, VM0, VM1 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfmaxa.s VM1, VX0, VX1 + vfmaxa.s VM0, VM0, VM1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d 
VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + VFMINA VM0, VX0, VX1 +#else + vreplvei.w VX0, VM0, 0 + vreplvei.w VX1, VM0, 1 + vreplvei.w VX2, VM0, 2 + vreplvei.w VX3, VM0, 3 + VFMINA VM1, VX0, VX1 + VFMINA VM2, VX2, VX3 + VFMINA VM0, VM1, VM2 +#endif + .align 3 + +.L23: //INCX!=1 and N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f1, X, 0 + addi.d I, I, -1 + add.d X, X, INCX + FMINA $f0, $f0, $f1 + bnez I, .L24 + .align 3 + +.L999: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/damax_lasx.S b/kernel/loongarch64/damax_lasx.S deleted file mode 100644 index c44ce4995..000000000 --- a/kernel/loongarch64/damax_lasx.S +++ /dev/null @@ -1,183 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define J $r13 -#define t1 $r14 -#define t2 $r18 -#define t3 $r15 -#define t4 $r17 -#define TEMP $r16 -#define m0 $xr8 -#define x1 $xr9 -#define x2 $xr10 -#define x3 $xr11 -#define x4 $xr12 -#define x5 $xr13 -#define x6 $xr14 -#define x7 $xr15 -#define x8 $xr16 -#define VX0 $xr20 -#define VX1 $xr21 -#define VM0 $xr22 -#define VM1 $xr23 -#define VM2 $xr18 -#define VM3 $xr19 - - PROLOGUE - - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - xvld VM0, X, 0 - srai.d I, N, 3 - bge $r0, I, .L12 - .align 3 - -.L10: - xvld VX0, X, 0 * SIZE - xvld VX1, X, 4 * SIZE - addi.d I, I, -1 - xvfmaxa.d VM1, VX1, VX0 - addi.d X, X, 8 * SIZE - xvfmaxa.d VM0, VM0, VM1 - blt $r0, I, .L10 - .align 3 - -.L11: - xvpickve.d x1, VM0, 0 - xvpickve.d x2, VM0, 1 - xvpickve.d x3, VM0, 2 - xvpickve.d x4, VM0, 3 - xvfmaxa.d VM1, x1, x2 - xvfmaxa.d VM2, x3, x4 - xvfmaxa.d VM0, VM1, VM2 - .align 3 - -.L12: //INCX==1 and N<8 - andi I, N, 7 - li.d J, 4 - bge J, I, .L13 // 4 Date: Wed, 27 Dec 2023 11:30:17 +0800 Subject: [PATCH 02/21] loongarch64: Refine copy,swap,nrm2,sum optimization. 
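
For reference, a minimal scalar sketch in C of the semantics the unified sum and nrm2 kernels are expected to match (illustrative names, n >= 0 and incx > 0 assumed; copy and swap are the usual element-wise move/exchange over strided vectors):

    #include <math.h>

    /* Minimal scalar reference, assuming incx > 0.
       Names are illustrative, not OpenBLAS interfaces. */

    static double ref_sum(long n, const double *x, long incx)
    {
        double s = 0.0;
        for (long i = 0; i < n; i++)
            s += x[i * incx];          /* plain sum, no absolute value */
        return s;
    }

    static double ref_nrm2(long n, const double *x, long incx)
    {
        double max = 0.0, sum = 0.0;
        /* pass 1: largest magnitude, used as the scaling factor */
        for (long i = 0; i < n; i++) {
            double a = fabs(x[i * incx]);
            if (a > max) max = a;
        }
        if (max == 0.0) return 0.0;
        /* pass 2: accumulate squares of x/max so the sum cannot overflow */
        for (long i = 0; i < n; i++) {
            double t = x[i * incx] / max;
            sum += t * t;
        }
        return max * sqrt(sum);
    }

nrm2 scales by the largest magnitude before squaring, which is why the vector kernels first reduce with fmaxa/xvfmaxa, then accumulate the scaled squares and finish with fsqrt and a multiply by the maximum.
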
--- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 12 +- kernel/loongarch64/KERNEL.LOONGSON3R5 | 12 +- kernel/loongarch64/copy_lasx.S | 306 ++++++++++++++++ kernel/loongarch64/copy_lsx.S | 316 +++++++++++++++++ kernel/loongarch64/dcopy_lasx.S | 224 ------------ kernel/loongarch64/dcopy_lsx.S | 232 ------------ kernel/loongarch64/dnrm2_lasx.S | 124 ++++--- kernel/loongarch64/dnrm2_lsx.S | 84 +++-- kernel/loongarch64/dsum_lasx.S | 125 ------- kernel/loongarch64/dsum_lsx.S | 123 ------- kernel/loongarch64/dswap_lasx.S | 301 ---------------- kernel/loongarch64/dswap_lsx.S | 317 ----------------- kernel/loongarch64/scopy_lasx.S | 216 ------------ kernel/loongarch64/scopy_lsx.S | 220 ------------ kernel/loongarch64/snrm2_lasx.S | 116 +++--- kernel/loongarch64/snrm2_lsx.S | 141 ++++---- kernel/loongarch64/ssum_lasx.S | 140 -------- kernel/loongarch64/ssum_lsx.S | 125 ------- kernel/loongarch64/sswap_lasx.S | 286 --------------- kernel/loongarch64/sswap_lsx.S | 294 ---------------- kernel/loongarch64/sum_lasx.S | 225 ++++++++++++ kernel/loongarch64/sum_lsx.S | 204 +++++++++++ kernel/loongarch64/swap_lasx.S | 401 +++++++++++++++++++++ kernel/loongarch64/swap_lsx.S | 431 +++++++++++++++++++++++ 24 files changed, 2159 insertions(+), 2816 deletions(-) create mode 100644 kernel/loongarch64/copy_lasx.S create mode 100644 kernel/loongarch64/copy_lsx.S delete mode 100644 kernel/loongarch64/dcopy_lasx.S delete mode 100644 kernel/loongarch64/dcopy_lsx.S delete mode 100644 kernel/loongarch64/dsum_lasx.S delete mode 100644 kernel/loongarch64/dsum_lsx.S delete mode 100644 kernel/loongarch64/dswap_lasx.S delete mode 100644 kernel/loongarch64/dswap_lsx.S delete mode 100644 kernel/loongarch64/scopy_lasx.S delete mode 100644 kernel/loongarch64/scopy_lsx.S delete mode 100644 kernel/loongarch64/ssum_lasx.S delete mode 100644 kernel/loongarch64/ssum_lsx.S delete mode 100644 kernel/loongarch64/sswap_lasx.S delete mode 100644 kernel/loongarch64/sswap_lsx.S create mode 100644 kernel/loongarch64/sum_lasx.S create mode 100644 kernel/loongarch64/sum_lsx.S create mode 100644 kernel/loongarch64/swap_lasx.S create mode 100644 kernel/loongarch64/swap_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 802dd1c9b..cb230b348 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -31,11 +31,11 @@ IDAMAXKERNEL = idamax_lsx.S ISAMINKERNEL = isamin_lsx.S IDAMINKERNEL = idamin_lsx.S -SCOPYKERNEL = scopy_lsx.S -DCOPYKERNEL = dcopy_lsx.S +SCOPYKERNEL = copy_lsx.S +DCOPYKERNEL = copy_lsx.S -SSWAPKERNEL = sswap_lsx.S -DSWAPKERNEL = dswap_lsx.S +SSWAPKERNEL = swap_lsx.S +DSWAPKERNEL = swap_lsx.S SAXPYKERNEL = saxpy_lsx.S DAXPYKERNEL = daxpy_lsx.S @@ -43,8 +43,8 @@ DAXPYKERNEL = daxpy_lsx.S SAXPBYKERNEL = saxpby_lsx.S DAXPBYKERNEL = daxpby_lsx.S -SSUMKERNEL = ssum_lsx.S -DSUMKERNEL = dsum_lsx.S +SSUMKERNEL = sum_lsx.S +DSUMKERNEL = sum_lsx.S SASUMKERNEL = sasum_lsx.S DASUMKERNEL = dasum_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 3253489d9..ba59c4566 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -31,11 +31,11 @@ IDAMAXKERNEL = idamax_lasx.S ISAMINKERNEL = isamin_lasx.S IDAMINKERNEL = idamin_lasx.S -SCOPYKERNEL = scopy_lasx.S -DCOPYKERNEL = dcopy_lasx.S +SCOPYKERNEL = copy_lasx.S +DCOPYKERNEL = copy_lasx.S -SSWAPKERNEL = sswap_lasx.S -DSWAPKERNEL = dswap_lasx.S +SSWAPKERNEL = swap_lasx.S +DSWAPKERNEL = swap_lasx.S SAXPYKERNEL = 
saxpy_lasx.S DAXPYKERNEL = daxpy_lasx.S @@ -43,8 +43,8 @@ DAXPYKERNEL = daxpy_lasx.S SAXPBYKERNEL = saxpby_lasx.S DAXPBYKERNEL = daxpby_lasx.S -SSUMKERNEL = ssum_lasx.S -DSUMKERNEL = dsum_lasx.S +SSUMKERNEL = sum_lasx.S +DSUMKERNEL = sum_lasx.S SASUMKERNEL = sasum_lasx.S DASUMKERNEL = dasum_lasx.S diff --git a/kernel/loongarch64/copy_lasx.S b/kernel/loongarch64/copy_lasx.S new file mode 100644 index 000000000..31f91cec1 --- /dev/null +++ b/kernel/loongarch64/copy_lasx.S @@ -0,0 +1,306 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define VX0 $xr12 +#define VX1 $xr13 + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +/* INCX==1 and INCY==1 */ +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + xvld VX0, X, 0 + addi.d I, I, -1 + xvst VX0, Y, 0 +#ifdef DOUBLE + xvld VX0, X, 32 + xvst VX0, Y, 32 +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + LD $f12, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + ST $f12, Y, 0 + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +/* INCX==1 and INCY!=1 */ +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + xvld VX0, X, 0 + xvld VX1, X, 32 + xvstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.d VX0, Y, 0, 1 + add.d Y, Y, INCY + xvstelm.d VX0, Y, 0, 2 + add.d Y, Y, INCY + xvstelm.d VX0, Y, 0, 3 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 1 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 2 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 3 + add.d Y, Y, INCY +#else + xvld VX0, X, 0 + xvstelm.w VX0, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 1 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 2 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 3 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 4 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 5 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 6 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 7 + add.d Y, Y, INCY +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD $f12, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + ST $f12, Y, 0 + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +/* INCX!=1 and INCY==1 */ +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + xvst VX0, Y, 0 + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvst VX1, Y, 32 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvst VX0, Y, 0 +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, 
.L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD $f12, X, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +/* INCX!=1 and INCY!=1 */ +.L22: + bge $r0, I, .L223 + .align 3 + +.L222: + LD a1, X, 0 + add.d X, X, INCX + LD a2, X, 0 + add.d X, X, INCX + LD a3, X, 0 + add.d X, X, INCX + LD a4, X, 0 + add.d X, X, INCX + ST a1, Y, 0 + add.d Y, Y, INCY + ST a2, Y, 0 + add.d Y, Y, INCY + ST a3, X, 0 + add.d Y, Y, INCY + ST a4, X, 0 + add.d Y, Y, INCY + LD a1, X, 0 + add.d X, X, INCX + LD a2, X, 0 + add.d X, X, INCX + LD a3, X, 0 + add.d X, X, INCX + LD a4, X, 0 + add.d X, X, INCX + ST a1, Y, 0 + add.d Y, Y, INCY + ST a2, Y, 0 + add.d Y, Y, INCY + ST a3, X, 0 + add.d Y, Y, INCY + ST a4, X, 0 + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + LD $f12, X, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/copy_lsx.S b/kernel/loongarch64/copy_lsx.S new file mode 100644 index 000000000..bb10f3565 --- /dev/null +++ b/kernel/loongarch64/copy_lsx.S @@ -0,0 +1,316 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define VX0 $vr12 +#define VX1 $vr13 + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +/* INCX==1 and INCY==1 */ +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + vld VX0, X, 0 + vld VX1, X, 16 + addi.d I, I, -1 + vst VX0, Y, 0 + vst VX1, Y, 16 +#ifdef DOUBLE + vld VX0, X, 32 + vld VX1, X, 48 + vst VX0, Y, 32 + vst VX1, Y, 48 +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + LD $f12, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + ST $f12, Y, 0 + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +/* INCX==1 and INCY!=1 */ +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + vld VX0, X, 0 + vld VX1, X, 16 + vstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VX0, Y, 0, 1 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 1 + add.d Y, Y, INCY + vld VX0, X, 32 + vld VX1, X, 48 + vstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VX0, Y, 0, 1 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 1 + add.d Y, Y, INCY +#else + vld VX0, X, 0 + vld VX1, X, 16 + vstelm.w VX0, Y, 0, 0 + add.d Y, Y, INCY + vstelm.w VX0, Y, 0, 1 + add.d Y, Y, INCY + vstelm.w VX0, Y, 0, 2 + add.d Y, Y, INCY + vstelm.w VX0, Y, 0, 3 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 0 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 1 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 2 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 3 + add.d Y, Y, INCY +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD $f12, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + ST $f12, Y, 0 + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +/* INCX!=1 and INCY==1 */ +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 0 + vst VX1, Y, 16 + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 32 + vst VX1, Y, 48 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vst VX0, Y, 0 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + 
vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vst VX1, Y, 16 +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD $f12, X, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +/* INCX!=1 and INCY!=1 */ +.L22: + bge $r0, I, .L223 + .align 3 + +.L222: + LD a1, X, 0 + add.d X, X, INCX + LD a2, X, 0 + add.d X, X, INCX + LD a3, X, 0 + add.d X, X, INCX + LD a4, X, 0 + add.d X, X, INCX + ST a1, Y, 0 + add.d Y, Y, INCY + ST a2, Y, 0 + add.d Y, Y, INCY + ST a3, X, 0 + add.d Y, Y, INCY + ST a4, X, 0 + add.d Y, Y, INCY + LD a1, X, 0 + add.d X, X, INCX + LD a2, X, 0 + add.d X, X, INCX + LD a3, X, 0 + add.d X, X, INCX + LD a4, X, 0 + add.d X, X, INCX + ST a1, Y, 0 + add.d Y, Y, INCY + ST a2, Y, 0 + add.d Y, Y, INCY + ST a3, X, 0 + add.d Y, Y, INCY + ST a4, X, 0 + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + LD $f12, X, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/dcopy_lasx.S b/kernel/loongarch64/dcopy_lasx.S deleted file mode 100644 index 9d7da4a80..000000000 --- a/kernel/loongarch64/dcopy_lasx.S +++ /dev/null @@ -1,224 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define Y $r7 -#define INCY $r8 -#define I $r17 -#define TEMP $r18 -#define t1 $r14 -#define t2 $r15 -#define t3 $r16 -#define t4 $r19 -#define a1 $f12 -#define a2 $f13 -#define a3 $f14 -#define a4 $f15 -#define VX0 $xr12 -#define VX1 $xr13 - - PROLOGUE - bge $r0, N, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L112 - .align 3 - -.L111: - xvld VX0, X, 0 * SIZE - xvld VX1, X, 4 * SIZE - xvst VX0, Y, 0 * SIZE - xvst VX1, Y, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L111 - .align 3 - -.L112: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L113: - fld.d $f12, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - fst.d $f12, Y, 0 * SIZE - addi.d Y, Y, SIZE - blt $r0, I, .L113 - b .L999 - .align 3 - -.L12: - bge $r0, I, .L122 - .align 3 - -.L121: - xvld VX0, X, 0 * SIZE - xvld VX1, X, 4 * SIZE - xvstelm.d VX0, Y, 0, 0 - add.d Y, Y, INCY - xvstelm.d VX0, Y, 0, 1 - add.d Y, Y, INCY - xvstelm.d VX0, Y, 0, 2 - add.d Y, Y, INCY - xvstelm.d VX0, Y, 0, 3 - add.d Y, Y, INCY - xvstelm.d VX1, Y, 0, 0 - add.d Y, Y, INCY - xvstelm.d VX1, Y, 0, 1 - add.d Y, Y, INCY - xvstelm.d VX1, Y, 0, 2 - add.d Y, Y, INCY - xvstelm.d VX1, Y, 0, 3 - add.d Y, Y, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - .align 3 - -.L122: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L123: - fld.d $f12, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - fst.d $f12, Y, 0 * SIZE - add.d Y, Y, INCY - blt $r0, I, .L123 - b .L999 - .align 3 - -.L21: - bge $r0, I, .L212 - .align 3 - -.L211: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - 
xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 - xvinsgr2vr.d VX0, t3, 2 - xvinsgr2vr.d VX0, t4, 3 - xvst VX0, Y, 0 * SIZE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX1, t1, 0 - xvinsgr2vr.d VX1, t2, 1 - xvinsgr2vr.d VX1, t3, 2 - xvinsgr2vr.d VX1, t4, 3 - xvst VX1, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - .align 3 - -.L212: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L213: - fld.d $f12, X, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - add.d X, X, INCX - addi.d Y, Y, SIZE - blt $r0, I, .L213 - b .L999 - .align 3 - -.L22: - bgez INCX, .L220 - .align 3 - -.L220: - bge $r0, I, .L223 - .align 3 - -.L222: - fld.d a1, X, 0 * SIZE - add.d X, X, INCX - fld.d a2, X, 0 * SIZE - add.d X, X, INCX - fld.d a3, X, 0 * SIZE - add.d X, X, INCX - fld.d a4, X, 0 * SIZE - add.d X, X, INCX - fst.d a1, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d a2, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d a3, X, 0 * SIZE - add.d Y, Y, INCY - fst.d a4, X, 0 * SIZE - add.d Y, Y, INCY - fld.d a1, X, 0 * SIZE - add.d X, X, INCX - fld.d a2, X, 0 * SIZE - add.d X, X, INCX - fld.d a3, X, 0 * SIZE - add.d X, X, INCX - fld.d a4, X, 0 * SIZE - add.d X, X, INCX - fst.d a1, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d a2, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d a3, X, 0 * SIZE - add.d Y, Y, INCY - fst.d a4, X, 0 * SIZE - add.d Y, Y, INCY - addi.d I, I, -1 - blt $r0, I, .L222 - .align 3 - -.L223: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L224: - fld.d $f12, X, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L224 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/dcopy_lsx.S b/kernel/loongarch64/dcopy_lsx.S deleted file mode 100644 index 161655bbd..000000000 --- a/kernel/loongarch64/dcopy_lsx.S +++ /dev/null @@ -1,232 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define Y $r7 -#define INCY $r8 -#define I $r17 -#define TEMP $r18 -#define t1 $r14 -#define t2 $r15 -#define t3 $r16 -#define t4 $r19 -#define a1 $f12 -#define a2 $f13 -#define a3 $f14 -#define a4 $f15 -#define VX0 $vr12 -#define VX1 $vr13 - - PROLOGUE - bge $r0, N, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L112 - .align 3 - -.L111: - vld VX0, X, 0 * SIZE - vld VX1, X, 2 * SIZE - vst VX0, Y, 0 * SIZE - vst VX1, Y, 2 * SIZE - vld VX0, X, 4 * SIZE - vld VX1, X, 6 * SIZE - addi.d I, I, -1 - vst VX0, Y, 4 * SIZE - vst VX1, Y, 6 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L111 - .align 3 - -.L112: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L113: - fld.d $f12, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - fst.d $f12, Y, 0 * SIZE - addi.d Y, Y, SIZE - blt $r0, I, .L113 - b .L999 - .align 3 - -.L12: - bge $r0, I, .L122 - .align 3 - -.L121: - vld VX0, X, 0 * SIZE - vld VX1, X, 2 * SIZE - vstelm.d VX0, Y, 0, 0 - add.d Y, Y, INCY - vstelm.d VX0, Y, 0, 1 - add.d Y, Y, INCY - vstelm.d VX1, Y, 0, 0 - add.d Y, Y, INCY - vstelm.d VX1, Y, 0, 1 - add.d Y, Y, INCY - vld VX0, X, 4 * SIZE - vld VX1, X, 6 * SIZE 
- vstelm.d VX0, Y, 0, 0 - add.d Y, Y, INCY - vstelm.d VX0, Y, 0, 1 - add.d Y, Y, INCY - vstelm.d VX1, Y, 0, 0 - add.d Y, Y, INCY - vstelm.d VX1, Y, 0, 1 - add.d Y, Y, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - .align 3 - -.L122: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L123: - fld.d $f12, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - fst.d $f12, Y, 0 * SIZE - add.d Y, Y, INCY - blt $r0, I, .L123 - b .L999 - .align 3 - -.L21: - bge $r0, I, .L212 - .align 3 - -.L211: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - vst VX0, Y, 0 * SIZE - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - vst VX1, Y, 2 * SIZE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - vst VX0, Y, 4 * SIZE - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - vst VX1, Y, 6 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - .align 3 - -.L212: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L213: - fld.d $f12, X, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - add.d X, X, INCX - addi.d Y, Y, SIZE - blt $r0, I, .L213 - b .L999 - .align 3 - -.L22: - bgez INCX, .L220 - .align 3 - -.L220: - bge $r0, I, .L223 - .align 3 - -.L222: - fld.d a1, X, 0 * SIZE - add.d X, X, INCX - fld.d a2, X, 0 * SIZE - add.d X, X, INCX - fld.d a3, X, 0 * SIZE - add.d X, X, INCX - fld.d a4, X, 0 * SIZE - add.d X, X, INCX - fst.d a1, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d a2, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d a3, X, 0 * SIZE - add.d Y, Y, INCY - fst.d a4, X, 0 * SIZE - add.d Y, Y, INCY - fld.d a1, X, 0 * SIZE - add.d X, X, INCX - fld.d a2, X, 0 * SIZE - add.d X, X, INCX - fld.d a3, X, 0 * SIZE - add.d X, X, INCX - fld.d a4, X, 0 * SIZE - add.d X, X, INCX - fst.d a1, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d a2, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d a3, X, 0 * SIZE - add.d Y, Y, INCY - fst.d a4, X, 0 * SIZE - add.d Y, Y, INCY - addi.d I, I, -1 - blt $r0, I, .L222 - .align 3 - -.L223: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L224: - fld.d $f12, X, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L224 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/dnrm2_lasx.S b/kernel/loongarch64/dnrm2_lasx.S index 2a9c3cf7b..5a6f7cf1e 100644 --- a/kernel/loongarch64/dnrm2_lasx.S +++ b/kernel/loongarch64/dnrm2_lasx.S @@ -1,3 +1,35 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. 
Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + #define ASSEMBLER #include "common.h" @@ -12,6 +44,8 @@ #define t2 $r13 #define t3 $r14 #define t4 $r15 + +/* Don't change following FR unless you know the effects. */ #define VX0 $xr15 #define VX1 $xr16 #define VM0 $xr17 @@ -35,6 +69,7 @@ xvxor.v res1, res1, res1 xvxor.v res2, res2, res2 + xvxor.v VM0, VM0, VM0 bge $r0, N, .L999 beq $r0, INCX, .L999 move XX, X @@ -46,12 +81,11 @@ slli.d INCX, INCX, BASE_SHIFT srai.d I, N, 3 bne INCX, TEMP, .L20 - xvld VM0, X, 0 bge $r0, I, .L97 .align 3 .L10: - xvld VX0, X, 0 * SIZE + xvld VX0, X, 0 xvld VX1, X, 4 * SIZE xvfmaxa.d VM1, VX1, VX0 xvfmaxa.d VM0, VM0, VM1 @@ -62,40 +96,32 @@ .align 3 .L20: // INCX!=1 - move TEMP, X // initialize the maxa value - ld.d t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - xvinsgr2vr.d VM0, t1, 0 - srai.d I, N, 3 bge $r0, I, .L97 - ld.d t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - xvinsgr2vr.d VM0, t2, 1 .align 3 .L21: - ld.d t1, X, 0 * SIZE + ld.d t1, X, 0 add.d X, X, INCX xvinsgr2vr.d VX0, t1, 0 - ld.d t2, X, 0 * SIZE + ld.d t2, X, 0 add.d X, X, INCX xvinsgr2vr.d VX0, t2, 1 - ld.d t3, X, 0 * SIZE + ld.d t3, X, 0 add.d X, X, INCX xvinsgr2vr.d VX0, t3, 2 - ld.d t4, X, 0 * SIZE + ld.d t4, X, 0 add.d X, X, INCX xvinsgr2vr.d VX0, t4, 3 - ld.d t1, X, 0 * SIZE + ld.d t1, X, 0 add.d X, X, INCX xvinsgr2vr.d VX1, t1, 0 - ld.d t2, X, 0 * SIZE + ld.d t2, X, 0 add.d X, X, INCX xvinsgr2vr.d VX1, t2, 1 - ld.d t3, X, 0 * SIZE + ld.d t3, X, 0 add.d X, X, INCX xvinsgr2vr.d VX1, t3, 2 - ld.d t4, X, 0 * SIZE + ld.d t4, X, 0 add.d X, X, INCX xvinsgr2vr.d VX1, t4, 3 xvfmaxa.d VM1, VX0, VX1 @@ -109,9 +135,9 @@ xvpickve.d VX0, VM0, 1 xvpickve.d VX1, VM0, 2 xvpickve.d VM3, VM0, 3 - xvfmaxa.d VM1, VX0, VX1 - xvfmaxa.d VM2, VM3, VM0 - xvfmaxa.d VM0, VM1, VM2 + fmaxa.d $f17, $f17, $f14 + fmaxa.d $f17, $f17, $f15 + fmaxa.d $f17, $f17, $f16 .align 3 .L97: @@ -149,12 +175,12 @@ .align 3 .L110: - xvld VX0, XX, 0 * SIZE + xvld VX0, XX, 0 xvld VX1, XX, 4 * SIZE - xvfmul.d VM0, VX0, VALPHA - xvfmul.d VM1, VX1, VALPHA - xvfmadd.d res1, VM0, VM0, res1 - xvfmadd.d res2, VM1, VM1, res2 + xvfmul.d VM2, VX0, VALPHA + xvfmul.d VM3, VX1, VALPHA + xvfmadd.d res1, VM2, VM2, res1 + xvfmadd.d res2, VM3, VM3, res2 addi.d XX, XX, 8 * SIZE addi.d I, I, -1 blt $r0, I, .L110 @@ -166,34 +192,34 @@ bge $r0, I, .L997 .L121: - ld.d t1, XX, 0 * SIZE + ld.d t1, XX, 0 add.d XX, XX, INCX - ld.d t2, XX, 0 * SIZE + ld.d t2, XX, 0 add.d XX, XX, INCX - ld.d t3, XX, 0 * SIZE + ld.d t3, XX, 0 add.d XX, XX, INCX - ld.d t4, XX, 0 * SIZE + ld.d t4, XX, 0 
add.d XX, XX, INCX xvinsgr2vr.d VX0, t1, 0 xvinsgr2vr.d VX0, t2, 1 xvinsgr2vr.d VX0, t3, 2 xvinsgr2vr.d VX0, t4, 3 - ld.d t1, XX, 0 * SIZE + ld.d t1, XX, 0 add.d XX, XX, INCX - ld.d t2, XX, 0 * SIZE + ld.d t2, XX, 0 add.d XX, XX, INCX - ld.d t3, XX, 0 * SIZE + ld.d t3, XX, 0 add.d XX, XX, INCX - ld.d t4, XX, 0 * SIZE + ld.d t4, XX, 0 add.d XX, XX, INCX - xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 xvinsgr2vr.d VX1, t3, 2 xvinsgr2vr.d VX1, t4, 3 - xvfmul.d VM0, VX0, VALPHA - xvfmul.d VM1, VX1, VALPHA - xvfmadd.d res1, VM0, VM0, res1 - xvfmadd.d res2, VM1, VM1, res2 + xvfmul.d VM2, VX0, VALPHA + xvfmul.d VM3, VX1, VALPHA + xvfmadd.d res1, VM2, VM2, res1 + xvfmadd.d res2, VM3, VM3, res2 addi.d I, I, -1 blt $r0, I, .L121 b .L996 @@ -203,10 +229,10 @@ xvfadd.d res1, res1, res2 xvpickve.d VX0, res1, 1 xvpickve.d VX1, res1, 2 - xvpickve.d VM0, res1, 3 - xvfadd.d res1, VX0, res1 - xvfadd.d VX1, VX1, VM0 - xvfadd.d res1, VX1, res1 + xvpickve.d VM2, res1, 3 + fadd.d $f19, $f19, $f15 + fadd.d $f19, $f19, $f16 + fadd.d $f19, $f19, $f13 .align 3 .L997: @@ -215,19 +241,17 @@ .align 3 .L998: - fld.d $f15, XX, 0 * SIZE + fld.d $f15, XX, 0 addi.d I, I, -1 fmul.d $f15, $f15, ALPHA fmadd.d $f19, $f15, $f15, $f19 add.d XX, XX , INCX blt $r0, I, .L998 - fsqrt.d $f19, $f19 - fmul.d $f0, max, $f19 - jirl $r0, $r1, 0x0 - .align 3 .L999: - fmov.d $f0, $f19 + fsqrt.d $f19, $f19 + fmul.d $f0, max, $f19 jirl $r0, $r1, 0x0 + .align 3 EPILOGUE diff --git a/kernel/loongarch64/dnrm2_lsx.S b/kernel/loongarch64/dnrm2_lsx.S index e4615e18d..fce4260e2 100644 --- a/kernel/loongarch64/dnrm2_lsx.S +++ b/kernel/loongarch64/dnrm2_lsx.S @@ -1,3 +1,35 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ + #define ASSEMBLER #include "common.h" @@ -12,6 +44,8 @@ #define t2 $r13 #define t3 $r14 #define t4 $r15 + +/* Don't change following FR unless you know the effects. */ #define VX0 $vr15 #define VX1 $vr16 #define VM0 $vr17 @@ -35,6 +69,7 @@ vxor.v res1, res1, res1 vxor.v res2, res2, res2 + vxor.v VM0, VM0, VM0 bge $r0, N, .L999 beq $r0, INCX, .L999 move XX, X @@ -46,7 +81,7 @@ slli.d INCX, INCX, BASE_SHIFT srai.d I, N, 3 bne INCX, TEMP, .L20 - vld VM0, X, 0 + bge $r0, I, .L97 .align 3 @@ -66,15 +101,7 @@ .align 3 .L20: // INCX!=1 - move TEMP, X // initialize the maxa value - ld.d t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.d VM0, t1, 0 - srai.d I, N, 3 bge $r0, I, .L97 - ld.d t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.d VM0, t2, 1 .align 3 .L21: @@ -154,16 +181,16 @@ .L110: vld VX0, XX, 0 * SIZE vld VX1, XX, 2 * SIZE - vfmul.d VM0, VX0, VALPHA - vfmul.d VM1, VX1, VALPHA - vfmadd.d res1, VM0, VM0, res1 - vfmadd.d res2, VM1, VM1, res2 + vfmul.d VM2, VX0, VALPHA + vfmul.d VM3, VX1, VALPHA + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 vld VX0, XX, 4 * SIZE vld VX1, XX, 6 * SIZE - vfmul.d VM0, VX0, VALPHA - vfmul.d VM1, VX1, VALPHA - vfmadd.d res1, VM0, VM0, res1 - vfmadd.d res2, VM1, VM1, res2 + vfmul.d VM2, VX0, VALPHA + vfmul.d VM3, VX1, VALPHA + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 addi.d XX, XX, 8 * SIZE addi.d I, I, -1 blt $r0, I, .L110 @@ -173,6 +200,7 @@ .L120: srai.d I, N, 3 bge $r0, I, .L997 + .align 3 .L121: ld.d t1, XX, 0 * SIZE @@ -187,14 +215,14 @@ vinsgr2vr.d VX0, t2, 1 vinsgr2vr.d VX1, t3, 0 vinsgr2vr.d VX1, t4, 1 - vfmul.d VM0, VX0, VALPHA + vfmul.d VM2, VX0, VALPHA ld.d t1, XX, 0 * SIZE add.d XX, XX, INCX - vfmul.d VM1, VX1, VALPHA + vfmul.d VM3, VX1, VALPHA ld.d t2, XX, 0 * SIZE add.d XX, XX, INCX - vfmadd.d res1, VM0, VM0, res1 - vfmadd.d res2, VM1, VM1, res2 + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 ld.d t3, XX, 0 * SIZE add.d XX, XX, INCX ld.d t4, XX, 0 * SIZE @@ -203,10 +231,10 @@ vinsgr2vr.d VX0, t2, 1 vinsgr2vr.d VX1, t3, 0 vinsgr2vr.d VX1, t4, 1 - vfmul.d VM0, VX0, VALPHA - vfmul.d VM1, VX1, VALPHA - vfmadd.d res1, VM0, VM0, res1 - vfmadd.d res2, VM1, VM1, res2 + vfmul.d VM2, VX0, VALPHA + vfmul.d VM3, VX1, VALPHA + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 addi.d I, I, -1 blt $r0, I, .L121 b .L996 @@ -230,13 +258,11 @@ fmadd.d $f19, $f15, $f15, $f19 add.d XX, XX , INCX blt $r0, I, .L998 - fsqrt.d $f19, $f19 - fmul.d $f0, max, $f19 - jirl $r0, $r1, 0x0 .align 3 .L999: - fmov.d $f0, $f19 + fsqrt.d $f19, $f19 + fmul.d $f0, max, $f19 jirl $r0, $r1, 0x0 EPILOGUE diff --git a/kernel/loongarch64/dsum_lasx.S b/kernel/loongarch64/dsum_lasx.S deleted file mode 100644 index 3c51dab60..000000000 --- a/kernel/loongarch64/dsum_lasx.S +++ /dev/null @@ -1,125 +0,0 @@ -#define ASSEMBLER -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r17 -#define TEMP $r18 -#define t1 $r15 -#define t2 $r12 -#define t3 $r13 -#define t4 $r14 -#define VX0 $xr12 -#define VX1 $xr13 -#define VX2 $xr14 -#define VX3 $xr15 -#define res1 $xr16 -#define res2 $xr17 - PROLOGUE - xvxor.v res1, res1, res1 - xvxor.v res2, res2, res2 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, SIZE - slli.d INCX, INCX, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bge $r0, I, .L13 - .align 3 - -.L11: - xvld VX0, X, 0 * SIZE - xvld VX1, X, 4 * SIZE - xvfadd.d res2, VX0, VX1 - xvfadd.d res1, 
res1, res2 - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L11 - .align 3 - -.L12: - xvpickve.d VX1, res1, 1 - xvpickve.d VX2, res1, 2 - xvpickve.d VX3, res1, 3 - xvfadd.d res1, VX1, res1 - xvfadd.d res1, VX2, res1 - xvfadd.d res1, VX3, res1 - .align 3 - -.L13: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L14: - fld.d $f12, X, 0 * SIZE - addi.d I, I, -1 - fadd.d $f16, $f12, $f16 - addi.d X, X, SIZE - blt $r0, I, .L14 - b .L999 - .align 3 - -.L20: - bge $r0, I, .L23 - .align 3 - -.L21: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 - xvinsgr2vr.d VX0, t3, 2 - xvinsgr2vr.d VX0, t4, 3 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX1, t1, 0 - xvinsgr2vr.d VX1, t2, 1 - xvinsgr2vr.d VX1, t3, 2 - xvinsgr2vr.d VX1, t4, 3 - xvfadd.d res2, VX0, VX1 - xvfadd.d res1, res1, res2 - addi.d I, I, -1 - blt $r0, I, .L21 - .align 3 - -.L22: - xvpickve.d VX1, res1, 1 - xvpickve.d VX2, res1, 2 - xvpickve.d VX3, res1, 3 - xvfadd.d res1, VX1, res1 - xvfadd.d res1, VX2, res1 - xvfadd.d res1, VX3, res1 - .align 3 - -.L23: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L24: - fld.d $f12, X, 0 * SIZE - fadd.d $f16, $f12, $f16 - addi.d I, I, -1 - add.d X, X, INCX - blt $r0, I, .L24 - .align 3 - -.L999: - fmov.d $f0, $f16 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/dsum_lsx.S b/kernel/loongarch64/dsum_lsx.S deleted file mode 100644 index 402d087df..000000000 --- a/kernel/loongarch64/dsum_lsx.S +++ /dev/null @@ -1,123 +0,0 @@ -#define ASSEMBLER -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r17 -#define TEMP $r18 -#define t1 $r15 -#define t2 $r12 -#define t3 $r13 -#define t4 $r14 -#define VX0 $vr12 -#define VX1 $vr13 -#define VX2 $vr14 -#define VX3 $vr15 -#define res1 $vr16 -#define res2 $vr17 - PROLOGUE - vxor.v res1, res1, res1 - vxor.v res2, res2, res2 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, SIZE - slli.d INCX, INCX, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bge $r0, I, .L13 - .align 3 - -.L11: - vld VX0, X, 0 * SIZE - vld VX1, X, 2 * SIZE - vfadd.d res2, VX0, VX1 - vfadd.d res1, res1, res2 - vld VX0, X, 4 * SIZE - vld VX1, X, 6 * SIZE - vfadd.d res2, VX0, VX1 - vfadd.d res1, res1, res2 - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L11 - .align 3 - -.L12: - vreplvei.d VX1, res1, 1 - vfadd.d res1, VX1, res1 - .align 3 - -.L13: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L14: - fld.d $f12, X, 0 * SIZE - fadd.d $f16, $f12, $f16 - addi.d I, I, -1 - addi.d X, X, SIZE - blt $r0, I, .L14 - b .L999 - .align 3 - -.L20: - bge $r0, I, .L23 - .align 3 - -.L21: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - add.d X, X, INCX - vfadd.d res2, VX0, VX1 - vfadd.d res1, res1, res2 - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t3, 0 - vinsgr2vr.d VX0, t4, 1 - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - add.d X, X, INCX - vfadd.d res2, VX0, VX1 - vfadd.d res1, res1, res2 - addi.d 
I, I, -1 - blt $r0, I, .L21 - .align 3 - -.L22: - vreplvei.d VX1, res1, 1 - vfadd.d res1, VX1, res1 - .align 3 - -.L23: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L24: - fld.d $f12, X, 0 * SIZE - fadd.d $f16, $f12, $f16 - addi.d I, I, -1 - add.d X, X, INCX - blt $r0, I, .L24 - .align 3 - -.L999: - fmov.d $f0, $f16 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/dswap_lasx.S b/kernel/loongarch64/dswap_lasx.S deleted file mode 100644 index 221cb7fa2..000000000 --- a/kernel/loongarch64/dswap_lasx.S +++ /dev/null @@ -1,301 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r7 -#define INCX $r8 -#define Y $r9 -#define INCY $r10 - -#define I $r17 -#define TEMP $r18 -#define XX $r5 -#define YY $r6 -#define t1 $r14 -#define t2 $r15 -#define t3 $r16 -#define t4 $r19 -#define a1 $f12 -#define a2 $f13 -#define a3 $f14 -#define a4 $f15 -#define b1 $f16 -#define b2 $f17 -#define b3 $f18 -#define b4 $f19 -#define VX0 $xr12 -#define VX1 $xr13 -#define VX2 $xr14 -#define VX3 $xr15 - - - PROLOGUE - bge $r0, N, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L112 - .align 3 - -.L111: - xvld VX0, X, 0 * SIZE - xvld VX1, X, 4 * SIZE - xvld VX2, Y, 0 * SIZE - xvld VX3, Y, 4 * SIZE - addi.d I, I, -1 - xvst VX2, X, 0 * SIZE - xvst VX3, X, 4 * SIZE - xvst VX0, Y, 0 * SIZE - xvst VX1, Y, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L111 - .align 3 - -.L112: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L113: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - fst.d $f14, X, 0 * SIZE - addi.d X, X, SIZE - addi.d Y, Y, SIZE - blt $r0, I, .L113 - b .L999 - .align 3 - -.L12: // INCX==1 and INCY!=1 - bge $r0, I, .L122 - .align 3 - -.L121: - xvld VX0, X, 0 * SIZE - ld.d t1, Y, 0 * SIZE - xvstelm.d VX0, Y, 0, 0 - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - xvstelm.d VX0, Y, 0, 1 - add.d Y, Y, INCY - ld.d t3, Y, 0 * SIZE - xvstelm.d VX0, Y, 0, 2 - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - xvstelm.d VX0, Y, 0, 3 - xvinsgr2vr.d VX2, t1, 0 - xvinsgr2vr.d VX2, t2, 1 - xvinsgr2vr.d VX2, t3, 2 - xvinsgr2vr.d VX2, t4, 3 - add.d Y, Y, INCY - xvst VX2, X, 0 * SIZE - xvld VX1, X, 4 * SIZE - ld.d t1, Y, 0 * SIZE - xvstelm.d VX1, Y, 0, 0 - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - xvstelm.d VX1, Y, 0, 1 - add.d Y, Y, INCY - ld.d t3, Y, 0 * SIZE - xvstelm.d VX1, Y, 0, 2 - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - xvstelm.d VX1, Y, 0, 3 - xvinsgr2vr.d VX3, t1, 0 - xvinsgr2vr.d VX3, t2, 1 - xvinsgr2vr.d VX3, t3, 2 - xvinsgr2vr.d VX3, t4, 3 - add.d Y, Y, INCY - xvst VX3, X, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - .align 3 - -.L122: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L123: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - fst.d $f14, X, 0 * SIZE - addi.d X, X, SIZE - add.d Y, Y, INCY - blt $r0, I, .L123 - b .L999 - .align 3 - -.L21: - bge $r0, I, .L212 - .align 3 - -.L211: - xvld VX2, Y, 0 * SIZE - ld.d t1, X, 0 * SIZE - xvstelm.d VX2, X, 0, 0 - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - xvstelm.d VX2, X, 0, 1 - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - xvstelm.d VX2, X, 0, 2 - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - 
xvstelm.d VX2, X, 0, 3 - xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 - xvinsgr2vr.d VX0, t3, 2 - xvinsgr2vr.d VX0, t4, 3 - add.d X, X, INCX - xvst VX0, Y, 0 * SIZE - xvld VX3, Y, 4 * SIZE - ld.d t1, X, 0 * SIZE - xvstelm.d VX3, X, 0, 0 - add.d X, X, INCY - ld.d t2, X, 0 * SIZE - xvstelm.d VX3, X, 0, 1 - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - xvstelm.d VX3, X, 0, 2 - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - xvstelm.d VX3, X, 0, 3 - xvinsgr2vr.d VX1, t1, 0 - xvinsgr2vr.d VX1, t2, 1 - xvinsgr2vr.d VX1, t3, 2 - xvinsgr2vr.d VX1, t4, 3 - add.d X, X, INCX - xvst VX1, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - .align 3 - -.L212: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L213: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - fst.d $f14, X, 0 * SIZE - add.d X, X, INCX - addi.d Y, Y, SIZE - blt $r0, I, .L213 - b .L999 - .align 3 - -.L22: - bgez INCX, .L220 - //addi.d TEMP, N, -1 - //mul.d TEMP, TEMP, INCX - //sub.d X, X, TEMP - .align 3 - -.L220: - bge $r0, I, .L223 - .align 3 - move XX, X - -.L222: - fld.d a1, X, 0 * SIZE - add.d X, X, INCX - fld.d a2, X, 0 * SIZE - add.d X, X, INCX - fld.d a3, X, 0 * SIZE - add.d X, X, INCX - fld.d a4, X, 0 * SIZE - add.d X, X, INCX - fld.d b1, Y, 0 * SIZE - fst.d a1, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d b2, Y, 0 * SIZE - fst.d a2, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d b3, Y, 0 * SIZE - fst.d a3, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d b4, Y, 0 * SIZE - fst.d a4, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d a1, X, 0 * SIZE - add.d X, X, INCX - fst.d b1, XX, 0 * SIZE - add.d XX, XX, INCX - fld.d b1, Y, 0 * SIZE - fst.d a1, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d a2, X, 0 * SIZE - add.d X, X, INCX - fst.d b2, XX, 0 * SIZE - add.d XX, XX, INCX - fld.d b2, Y, 0 * SIZE - fst.d a2, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d a3, X, 0 * SIZE - add.d X, X, INCX - fst.d b3, XX, 0 * SIZE - add.d XX, XX, INCX - fld.d b3, Y, 0 * SIZE - fst.d a3, Y, 0 * SIZE - fld.d a4, X, 0 * SIZE - add.d X, X, INCX - fst.d b4, XX, 0 * SIZE - add.d XX, XX, INCX - fld.d b4, Y, 0 * SIZE - fst.d a4, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d b1, XX, 0 * SIZE - add.d XX, XX, INCX - fst.d b2, XX, 0 * SIZE - add.d XX, XX, INCX - fst.d b3, XX, 0 * SIZE - add.d XX, XX, INCX - fst.d b4, XX, 0 * SIZE - add.d XX, XX, INCX - addi.d I, I, -1 - blt $r0, I, .L222 - .align 3 - -.L223: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L224: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - fst.d $f14, X, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L224 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/dswap_lsx.S b/kernel/loongarch64/dswap_lsx.S deleted file mode 100644 index 7f7f585e1..000000000 --- a/kernel/loongarch64/dswap_lsx.S +++ /dev/null @@ -1,317 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r7 -#define INCX $r8 -#define Y $r9 -#define INCY $r10 - -#define I $r17 -#define TEMP $r18 -#define XX $r5 -#define YY $r6 -#define t1 $r14 -#define t2 $r15 -#define t3 $r16 -#define t4 $r19 -#define a1 $f12 -#define a2 $f13 -#define a3 $f14 -#define a4 $f15 -#define b1 $f16 -#define b2 $f17 -#define b3 $f18 -#define b4 $f19 -#define VX0 $vr12 -#define VX1 $vr13 -#define VX2 $vr14 -#define VX3 $vr15 - - - PROLOGUE - bge $r0, N, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - srai.d I, N, 3 - bne 
INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L112 - .align 3 - -.L111: - vld VX0, X, 0 * SIZE - vld VX1, X, 2 * SIZE - vld VX2, Y, 0 * SIZE - vld VX3, Y, 2 * SIZE - vst VX2, X, 0 * SIZE - vst VX3, X, 2 * SIZE - vst VX0, Y, 0 * SIZE - vst VX1, Y, 2 * SIZE - vld VX0, X, 4 * SIZE - vld VX1, X, 6 * SIZE - vld VX2, Y, 4 * SIZE - vld VX3, Y, 6 * SIZE - addi.d I, I, -1 - vst VX2, X, 4 * SIZE - vst VX3, X, 6 * SIZE - vst VX0, Y, 4 * SIZE - vst VX1, Y, 6 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L111 - .align 3 - -.L112: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L113: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - fst.d $f14, X, 0 * SIZE - addi.d X, X, SIZE - addi.d Y, Y, SIZE - blt $r0, I, .L113 - b .L999 - .align 3 - -.L12: // INCX==1 and INCY!=1 - bge $r0, I, .L122 - .align 3 - -.L121: - vld VX0, X, 0 * SIZE - ld.d t1, Y, 0 * SIZE - vstelm.d VX0, Y, 0, 0 - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vstelm.d VX0, Y, 0, 1 - vinsgr2vr.d VX2, t1, 0 - vinsgr2vr.d VX2, t2, 1 - add.d Y, Y, INCY - vst VX2, X, 0 * SIZE - vld VX1, X, 2 * SIZE - ld.d t3, Y, 0 * SIZE - vstelm.d VX1, Y, 0, 0 - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vstelm.d VX1, Y, 0, 1 - vinsgr2vr.d VX3, t3, 0 - vinsgr2vr.d VX3, t4, 1 - add.d Y, Y, INCY - vst VX3, X, 2 * SIZE - vld VX0, X, 4 * SIZE - ld.d t1, Y, 0 * SIZE - vstelm.d VX0, Y, 0, 0 - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vstelm.d VX0, Y, 0, 1 - vinsgr2vr.d VX2, t1, 0 - vinsgr2vr.d VX2, t2, 1 - add.d Y, Y, INCY - vst VX2, X, 4 * SIZE - vld VX1, X, 6 * SIZE - ld.d t3, Y, 0 * SIZE - vstelm.d VX1, Y, 0, 0 - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vstelm.d VX1, Y, 0, 1 - vinsgr2vr.d VX3, t3, 0 - vinsgr2vr.d VX3, t4, 1 - add.d Y, Y, INCY - vst VX3, X, 6 * SIZE - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - .align 3 - -.L122: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L123: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - fst.d $f14, X, 0 * SIZE - addi.d X, X, SIZE - add.d Y, Y, INCY - blt $r0, I, .L123 - b .L999 - .align 3 - -.L21: - bge $r0, I, .L212 - .align 3 - -.L211: - vld VX2, Y, 0 * SIZE - ld.d t1, X, 0 * SIZE - vstelm.d VX2, X, 0, 0 - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - vstelm.d VX2, X, 0, 1 - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - add.d X, X, INCY - vst VX0, Y, 0 * SIZE - vld VX3, Y, 2 * SIZE - ld.d t3, X, 0 * SIZE - vstelm.d VX3, X, 0, 0 - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - vstelm.d VX3, X, 0, 1 - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - add.d X, X, INCX - vst VX1, Y, 2 * SIZE - vld VX2, Y, 4 * SIZE - ld.d t1, X, 0 * SIZE - vstelm.d VX2, X, 0, 0 - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - vstelm.d VX2, X, 0, 1 - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - add.d X, X, INCY - vst VX0, Y, 4 * SIZE - vld VX3, Y, 6 * SIZE - ld.d t3, X, 0 * SIZE - vstelm.d VX3, X, 0, 0 - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - vstelm.d VX3, X, 0, 1 - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - add.d X, X, INCX - vst VX1, Y, 6 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - .align 3 - -.L212: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L213: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - fst.d $f14, X, 0 * SIZE - add.d X, X, INCX - addi.d Y, Y, SIZE - 
blt $r0, I, .L213 - b .L999 - .align 3 - -.L22: - bgez INCX, .L220 - //addi.d TEMP, N, -1 - //mul.d TEMP, TEMP, INCX - //sub.d X, X, TEMP - .align 3 - -.L220: - bge $r0, I, .L223 - .align 3 - move XX, X - -.L222: - fld.d a1, X, 0 * SIZE - add.d X, X, INCX - fld.d a2, X, 0 * SIZE - add.d X, X, INCX - fld.d a3, X, 0 * SIZE - add.d X, X, INCX - fld.d a4, X, 0 * SIZE - add.d X, X, INCX - fld.d b1, Y, 0 * SIZE - fst.d a1, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d b2, Y, 0 * SIZE - fst.d a2, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d b3, Y, 0 * SIZE - fst.d a3, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d b4, Y, 0 * SIZE - fst.d a4, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d a1, X, 0 * SIZE - add.d X, X, INCX - fst.d b1, XX, 0 * SIZE - add.d XX, XX, INCX - fld.d b1, Y, 0 * SIZE - fst.d a1, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d a2, X, 0 * SIZE - add.d X, X, INCX - fst.d b2, XX, 0 * SIZE - add.d XX, XX, INCX - fld.d b2, Y, 0 * SIZE - fst.d a2, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d a3, X, 0 * SIZE - add.d X, X, INCX - fst.d b3, XX, 0 * SIZE - add.d XX, XX, INCX - fld.d b3, Y, 0 * SIZE - fst.d a3, Y, 0 * SIZE - fld.d a4, X, 0 * SIZE - add.d X, X, INCX - fst.d b4, XX, 0 * SIZE - add.d XX, XX, INCX - fld.d b4, Y, 0 * SIZE - fst.d a4, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d b1, XX, 0 * SIZE - add.d XX, XX, INCX - fst.d b2, XX, 0 * SIZE - add.d XX, XX, INCX - fst.d b3, XX, 0 * SIZE - add.d XX, XX, INCX - fst.d b4, XX, 0 * SIZE - add.d XX, XX, INCX - addi.d I, I, -1 - blt $r0, I, .L222 - .align 3 - -.L223: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L224: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - fst.d $f14, X, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L224 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/scopy_lasx.S b/kernel/loongarch64/scopy_lasx.S deleted file mode 100644 index 7db1e7cee..000000000 --- a/kernel/loongarch64/scopy_lasx.S +++ /dev/null @@ -1,216 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define Y $r7 -#define INCY $r8 -#define I $r17 -#define TEMP $r18 -#define t1 $r14 -#define t2 $r15 -#define t3 $r16 -#define t4 $r19 -#define a1 $f12 -#define a2 $f13 -#define a3 $f14 -#define a4 $f15 -#define VX0 $xr12 -#define VX1 $xr13 - - PROLOGUE - bge $r0, N, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L112 - .align 3 - -.L111: - xvld VX0, X, 0 * SIZE - addi.d I, I, -1 - xvst VX0, Y, 0 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L111 - .align 3 - -.L112: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L113: - fld.s $f12, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - fst.s $f12, Y, 0 * SIZE - addi.d Y, Y, SIZE - blt $r0, I, .L113 - b .L999 - .align 3 - -.L12: - bge $r0, I, .L122 - .align 3 - -.L121: - xvld VX0, X, 0 * SIZE - xvstelm.w VX0, Y, 0, 0 - add.d Y, Y, INCY - xvstelm.w VX0, Y, 0, 1 - add.d Y, Y, INCY - xvstelm.w VX0, Y, 0, 2 - add.d Y, Y, INCY - xvstelm.w VX0, Y, 0, 3 - add.d Y, Y, INCY - xvstelm.w VX0, Y, 0, 4 - add.d Y, Y, INCY - xvstelm.w VX0, Y, 0, 5 - add.d Y, Y, INCY - xvstelm.w VX0, Y, 0, 6 - add.d Y, Y, INCY - xvstelm.w VX0, Y, 0, 7 - add.d Y, Y, INCY - addi.d X, X, 8 
* SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - .align 3 - -.L122: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L123: - fld.s $f12, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - fst.s $f12, Y, 0 * SIZE - add.d Y, Y, INCY - blt $r0, I, .L123 - b .L999 - .align 3 - -.L21: - bge $r0, I, .L212 - .align 3 - -.L211: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - xvst VX0, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - .align 3 - -.L212: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L213: - fld.s $f12, X, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - add.d X, X, INCX - addi.d Y, Y, SIZE - blt $r0, I, .L213 - b .L999 - .align 3 - -.L22: - bge $r0, I, .L223 - .align 3 - -.L222: - fld.s a1, X, 0 * SIZE - add.d X, X, INCX - fld.s a2, X, 0 * SIZE - add.d X, X, INCX - fld.s a3, X, 0 * SIZE - add.d X, X, INCX - fld.s a4, X, 0 * SIZE - add.d X, X, INCX - fst.s a1, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s a2, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s a3, X, 0 * SIZE - add.d Y, Y, INCY - fst.s a4, X, 0 * SIZE - add.d Y, Y, INCY - fld.s a1, X, 0 * SIZE - add.d X, X, INCX - fld.s a2, X, 0 * SIZE - add.d X, X, INCX - fld.s a3, X, 0 * SIZE - add.d X, X, INCX - fld.s a4, X, 0 * SIZE - add.d X, X, INCX - fst.s a1, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s a2, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s a3, X, 0 * SIZE - add.d Y, Y, INCY - fst.s a4, X, 0 * SIZE - add.d Y, Y, INCY - addi.d I, I, -1 - blt $r0, I, .L222 - .align 3 - -.L223: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L224: - fld.s $f12, X, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L224 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/scopy_lsx.S b/kernel/loongarch64/scopy_lsx.S deleted file mode 100644 index 32150d3d6..000000000 --- a/kernel/loongarch64/scopy_lsx.S +++ /dev/null @@ -1,220 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define Y $r7 -#define INCY $r8 -#define I $r17 -#define TEMP $r18 -#define t1 $r14 -#define t2 $r15 -#define t3 $r16 -#define t4 $r19 -#define a1 $f12 -#define a2 $f13 -#define a3 $f14 -#define a4 $f15 -#define VX0 $vr12 -#define VX1 $vr13 - - PROLOGUE - bge $r0, N, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L112 - .align 3 - -.L111: - vld VX0, X, 0 * SIZE - vld VX1, X, 4 * SIZE - addi.d I, I, -1 - vst VX0, Y, 0 * SIZE - vst VX1, Y, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L111 - .align 3 - -.L112: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L113: - fld.s $f12, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - fst.s $f12, Y, 0 * SIZE - addi.d Y, Y, SIZE - blt $r0, I, .L113 
- b .L999 - .align 3 - -.L12: - bge $r0, I, .L122 - .align 3 - -.L121: - vld VX0, X, 0 * SIZE - vld VX1, X, 4 * SIZE - vstelm.w VX0, Y, 0, 0 - add.d Y, Y, INCY - vstelm.w VX0, Y, 0, 1 - add.d Y, Y, INCY - vstelm.w VX0, Y, 0, 2 - add.d Y, Y, INCY - vstelm.w VX0, Y, 0, 3 - add.d Y, Y, INCY - vstelm.w VX1, Y, 0, 0 - add.d Y, Y, INCY - vstelm.w VX1, Y, 0, 1 - add.d Y, Y, INCY - vstelm.w VX1, Y, 0, 2 - add.d Y, Y, INCY - vstelm.w VX1, Y, 0, 3 - add.d Y, Y, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - .align 3 - -.L122: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L123: - fld.s $f12, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - fst.s $f12, Y, 0 * SIZE - add.d Y, Y, INCY - blt $r0, I, .L123 - b .L999 - .align 3 - -.L21: - bge $r0, I, .L212 - .align 3 - -.L211: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - vst VX0, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vst VX1, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - .align 3 - -.L212: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L213: - fld.s $f12, X, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - add.d X, X, INCX - addi.d Y, Y, SIZE - blt $r0, I, .L213 - b .L999 - .align 3 - -.L22: - bge $r0, I, .L223 - .align 3 - -.L222: - fld.s a1, X, 0 * SIZE - add.d X, X, INCX - fld.s a2, X, 0 * SIZE - add.d X, X, INCX - fld.s a3, X, 0 * SIZE - add.d X, X, INCX - fld.s a4, X, 0 * SIZE - add.d X, X, INCX - fst.s a1, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s a2, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s a3, X, 0 * SIZE - add.d Y, Y, INCY - fst.s a4, X, 0 * SIZE - add.d Y, Y, INCY - fld.s a1, X, 0 * SIZE - add.d X, X, INCX - fld.s a2, X, 0 * SIZE - add.d X, X, INCX - fld.s a3, X, 0 * SIZE - add.d X, X, INCX - fld.s a4, X, 0 * SIZE - add.d X, X, INCX - fst.s a1, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s a2, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s a3, X, 0 * SIZE - add.d Y, Y, INCY - fst.s a4, X, 0 * SIZE - add.d Y, Y, INCY - addi.d I, I, -1 - blt $r0, I, .L222 - .align 3 - -.L223: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L224: - fld.s $f12, X, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L224 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/snrm2_lasx.S b/kernel/loongarch64/snrm2_lasx.S index 274908c14..3ae11e897 100644 --- a/kernel/loongarch64/snrm2_lasx.S +++ b/kernel/loongarch64/snrm2_lasx.S @@ -1,3 +1,35 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. 
Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + #define ASSEMBLER #include "common.h" @@ -11,10 +43,13 @@ #define t2 $r13 #define t3 $r14 #define t4 $r15 + +/* Don't change following FR unless you know the effects. */ #define VX0 $xr15 #define VX1 $xr16 #define VX2 $xr17 #define VX3 $xr18 +#define VX4 $xr21 #define res1 $xr19 #define res2 $xr20 @@ -37,14 +72,13 @@ .align 3 .L10: - xvld VX0, X, 0 * SIZE - xvld VX1, X, 0 * SIZE - xvfcvtl.d.s VX0, VX0 - xvfcvth.d.s VX1, VX1 - xvfmadd.d res1, VX0, VX0, res1 - xvfmadd.d res2, VX1, VX1, res2 + xvld VX0, X, 0 + xvfcvtl.d.s VX1, VX0 + xvfcvth.d.s VX2, VX0 + xvfmadd.d res1, VX1, VX1, res1 + xvfmadd.d res2, VX2, VX2, res2 addi.d I, I, -1 - addi.d X, X, 8 * SIZE + addi.d X, X, 8 * SIZE blt $r0, I, .L10 .align 3 b .L996 @@ -54,70 +88,46 @@ .align 3 .L21: - ld.w t1, X, 0 * SIZE + ld.w t1, X, 0 add.d X, X, INCX - ld.w t2, X, 0 * SIZE + ld.w t2, X, 0 add.d X, X, INCX - ld.w t3, X, 0 * SIZE + ld.w t3, X, 0 add.d X, X, INCX - ld.w t4, X, 0 * SIZE + ld.w t4, X, 0 add.d X, X, INCX xvinsgr2vr.w VX0, t1, 0 xvinsgr2vr.w VX0, t2, 1 xvinsgr2vr.w VX0, t3, 2 xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE + ld.w t1, X, 0 add.d X, X, INCX - ld.w t2, X, 0 * SIZE + ld.w t2, X, 0 add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE + ld.w t3, X, 0 add.d X, X, INCX + ld.w t4, X, 0 xvinsgr2vr.w VX0, t1, 4 xvinsgr2vr.w VX0, t2, 5 xvinsgr2vr.w VX0, t3, 6 xvinsgr2vr.w VX0, t4, 7 - ld.w t1, X, 0 * SIZE add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX1, t1, 0 - xvinsgr2vr.w VX1, t2, 1 - xvinsgr2vr.w VX1, t3, 2 - xvinsgr2vr.w VX1, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX1, t1, 4 - xvinsgr2vr.w VX1, t2, 5 - xvinsgr2vr.w VX1, t3, 6 - xvinsgr2vr.w VX1, t4, 7 - xvfcvtl.d.s VX0, VX0 - xvfcvth.d.s VX1, VX1 - xvfmadd.d res1, VX0, VX0, res1 - xvfmadd.d res2, VX1, VX1, res2 + xvfcvtl.d.s VX1, VX0 + xvfcvth.d.s VX2, VX0 + xvfmadd.d res1, VX1, VX1, res1 + xvfmadd.d res2, VX2, VX2, res2 addi.d I, I, -1 blt $r0, I, .L21 b .L996 .L996: xvfadd.d res1, res1, res2 - xvpickve.w VX1, res1, 1 - xvpickve.w VX2, res1, 2 - 
xvpickve.w VX3, res1, 3 - xvfadd.s res1, VX1, res1 - xvfadd.s res1, VX2, res1 - xvfadd.s res1, VX3, res1 + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + fadd.d $f19, $f19, $f16 + fadd.d $f19, $f19, $f17 + fadd.d $f19, $f19, $f18 .align 3 .L997: @@ -126,11 +136,11 @@ .align 3 .L998: - fld.s $f15, X, 0 * SIZE - addi.d I, I, -1 + fld.s $f15, X, 0 + add.d X, X, INCX + addi.d I, I, -1 fcvt.d.s $f15, $f15 - fmadd.d $f19, $f15, $f15, $f19 - add.d X, X, INCX + fmadd.d $f19, $f15, $f15, $f19 blt $r0, I, .L998 .align 3 diff --git a/kernel/loongarch64/snrm2_lsx.S b/kernel/loongarch64/snrm2_lsx.S index 17d017900..bb492dbf0 100644 --- a/kernel/loongarch64/snrm2_lsx.S +++ b/kernel/loongarch64/snrm2_lsx.S @@ -1,3 +1,35 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + #define ASSEMBLER #include "common.h" @@ -15,6 +47,9 @@ #define VX1 $vr16 #define VX2 $vr17 #define VX3 $vr18 +#define VX4 $vr21 +#define VX5 $vr22 +/* Don't change following FR unless you know the effects. 
*/ #define res1 $vr19 #define res2 $vr20 @@ -24,99 +59,71 @@ LDINT N, 0(N) LDINT INCX, 0(INCX) #endif - vxor.v res1, res1, res1 vxor.v res2, res2, res2 - bge $r0, N, .L999 + bge $r0, N, .L999 beq $r0, INCX, .L999 li.d TEMP, SIZE slli.d INCX, INCX, BASE_SHIFT srai.d I, N, 3 bne INCX, TEMP, .L20 - bge $r0, I, .L997 + bge $r0, I, .L997 .align 3 .L10: - vld VX0, X, 0 * SIZE - vld VX1, X, 0 * SIZE - vfcvtl.d.s VX0, VX0 - vfcvth.d.s VX1, VX1 - vfmadd.d res1, VX0, VX0, res1 - vfmadd.d res2, VX1, VX1, res2 - vld VX2, X, 4 * SIZE - vld VX3, X, 4 * SIZE - vfcvtl.d.s VX2, VX2 - vfcvth.d.s VX3, VX3 - vfmadd.d res1, VX2, VX2, res1 - vfmadd.d res2, VX3, VX3, res2 + vld VX0, X, 0 + vld VX5, X, 4 * SIZE addi.d I, I, -1 - addi.d X, X, 8 * SIZE + addi.d X, X, 8 * SIZE + vfcvtl.d.s VX1, VX0 + vfcvth.d.s VX2, VX0 + vfcvtl.d.s VX3, VX5 + vfcvth.d.s VX4, VX5 + vfmadd.d res1, VX1, VX1, res1 + vfmadd.d res2, VX2, VX2, res2 + vfmadd.d res1, VX3, VX3, res1 + vfmadd.d res2, VX4, VX4, res2 blt $r0, I, .L10 b .L996 .align 3 - .L20: bge $r0, I, .L997 .align 3 .L21: - ld.w t1, X, 0 * SIZE + ld.w t1, X, 0 add.d X, X, INCX - ld.w t2, X, 0 * SIZE + ld.w t2, X, 0 add.d X, X, INCX - ld.w t3, X, 0 * SIZE + ld.w t3, X, 0 add.d X, X, INCX - ld.w t4, X, 0 * SIZE + ld.w t4, X, 0 add.d X, X, INCX vinsgr2vr.w VX0, t1, 0 vinsgr2vr.w VX0, t2, 1 vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE + vinsgr2vr.w VX0, t4, 3 + vfcvtl.d.s VX1, VX0 + vfcvth.d.s VX2, VX0 + vfmadd.d res1, VX1, VX1, res1 + vfmadd.d res2, VX2, VX2, res2 + ld.w t1, X, 0 add.d X, X, INCX - ld.w t2, X, 0 * SIZE + ld.w t2, X, 0 add.d X, X, INCX - ld.w t3, X, 0 * SIZE + ld.w t3, X, 0 add.d X, X, INCX - ld.w t4, X, 0 * SIZE + ld.w t4, X, 0 add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vfcvtl.d.s VX0, VX0 - vfcvth.d.s VX1, VX1 - vfmadd.d res1, VX0, VX0, res1 - vfmadd.d res2, VX1, VX1, res2 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - vfcvtl.d.s VX2, VX2 - vfcvth.d.s VX3, VX3 - vfmadd.d res1, VX2, VX2, res1 - vfmadd.d res2, VX3, VX3, res2 + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vfcvtl.d.s VX3, VX0 + vfcvth.d.s VX4, VX0 + vfmadd.d res1, VX3, VX3, res1 + vfmadd.d res2, VX4, VX4, res2 addi.d I, I, -1 blt $r0, I, .L21 b .L996 @@ -124,12 +131,8 @@ .L996: vfadd.d res1, res1, res2 - vreplvei.w VX1, res1, 1 - vreplvei.w VX2, res1, 2 - vreplvei.w VX3, res1, 3 - vfadd.s res1, VX1, res1 - vfadd.s res1, VX2, res1 - vfadd.s res1, VX3, res1 + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 .align 3 .L997: @@ -138,7 +141,7 @@ .align 3 .L998: - fld.s $f15, X, 0 * SIZE + fld.s $f15, X, 0 addi.d I, I, -1 fcvt.d.s $f15, $f15 fmadd.d $f19, $f15, $f15, $f19 diff --git a/kernel/loongarch64/ssum_lasx.S b/kernel/loongarch64/ssum_lasx.S deleted file mode 100644 index 7cf57bc77..000000000 --- a/kernel/loongarch64/ssum_lasx.S +++ /dev/null @@ -1,140 +0,0 @@ -#define ASSEMBLER -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r17 
-#define TEMP $r18 -#define t1 $r15 -#define t2 $r12 -#define t3 $r13 -#define t4 $r14 -#define VX0 $xr12 -#define VX1 $xr13 -#define VX2 $xr14 -#define VX3 $xr15 -#define res1 $xr16 -#define res2 $xr17 - PROLOGUE - xvxor.v res1, res1, res1 - xvxor.v res2, res2, res2 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, SIZE - slli.d INCX, INCX, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bge $r0, I, .L13 - .align 3 - -.L11: - xvld VX0, X, 0 * SIZE - xvfadd.s res1, VX0, res1 - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L11 - .align 3 - -.L12: - xvfadd.s res2, res1, res2 - xvpickve.w VX1, res1, 1 - xvpickve.w VX2, res1, 2 - xvpickve.w VX3, res1, 3 - xvfadd.s res1, VX1, res1 - xvfadd.s res1, VX2, res1 - xvfadd.s res1, VX3, res1 - xvpickve.w VX0, res2, 4 - xvpickve.w VX1, res2, 5 - xvpickve.w VX2, res2, 6 - xvpickve.w VX3, res2, 7 - xvfadd.s res1, VX0, res1 - xvfadd.s res1, VX1, res1 - xvfadd.s res1, VX2, res1 - xvfadd.s res1, VX2, res1 - .align 3 - -.L13: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L14: - fld.s $f12, X, 0 * SIZE - fadd.s $f16, $f12, $f16 - addi.d I, I, -1 - addi.d X, X, SIZE - blt $r0, I, .L14 - b .L999 - .align 3 - -.L20: - bge $r0, I, .L23 - .align 3 - -.L21: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - xvfadd.s res1, VX0, res1 - addi.d I, I, -1 - blt $r0, I, .L21 - .align 3 - -.L22: - xvfadd.s res2, res1, res2 - xvpickve.w VX1, res1, 1 - xvpickve.w VX2, res1, 2 - xvpickve.w VX3, res1, 3 - xvfadd.s res1, VX1, res1 - xvfadd.s res1, VX2, res1 - xvfadd.s res1, VX3, res1 - xvpickve.w VX0, res2, 4 - xvpickve.w VX1, res2, 5 - xvpickve.w VX2, res2, 6 - xvpickve.w VX3, res2, 7 - xvfadd.s res1, VX0, res1 - xvfadd.s res1, VX1, res1 - xvfadd.s res1, VX2, res1 - xvfadd.s res1, VX2, res1 - .align 3 - -.L23: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L24: - fld.s $f12, X, 0 * SIZE - fadd.s $f16, $f12, $f16 - addi.d I, I, -1 - add.d X, X, INCX - blt $r0, I, .L24 - .align 3 - -.L999: - fmov.s $f0, $f16 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/ssum_lsx.S b/kernel/loongarch64/ssum_lsx.S deleted file mode 100644 index de63c69e3..000000000 --- a/kernel/loongarch64/ssum_lsx.S +++ /dev/null @@ -1,125 +0,0 @@ -#define ASSEMBLER -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r17 -#define TEMP $r18 -#define t1 $r15 -#define t2 $r12 -#define t3 $r13 -#define t4 $r14 -#define VX0 $vr12 -#define VX1 $vr13 -#define VX2 $vr14 -#define VX3 $vr15 -#define res1 $vr16 -#define res2 $vr17 - PROLOGUE - vxor.v res1, res1, res1 - vxor.v res2, res2, res2 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, SIZE - slli.d INCX, INCX, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bge $r0, I, .L13 - .align 3 - -.L11: - vld VX0, X, 0 * SIZE - vld VX1, X, 4 * SIZE - vfadd.s res2, VX0, VX1 - vfadd.s res1, res1, res2 - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L11 - .align 3 - -.L12: - vreplvei.w VX1, res1, 1 - vreplvei.w VX2, res1, 2 - vreplvei.w VX3, res1, 3 - vfadd.s 
res1, VX1, res1 - vfadd.s res1, VX2, res1 - vfadd.s res1, VX3, res1 - .align 3 - -.L13: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L14: - fld.s $f12, X, 0 * SIZE - fadd.s $f16, $f12, $f16 - addi.d I, I, -1 - addi.d X, X, SIZE - blt $r0, I, .L14 - b .L999 - .align 3 - -.L20: - bge $r0, I, .L23 - .align 3 - -.L21: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vfadd.s res2, VX0, VX1 - vfadd.s res1, res1, res2 - addi.d I, I, -1 - blt $r0, I, .L21 - .align 3 - -.L22: - vreplvei.w VX1, res1, 1 - vreplvei.w VX2, res1, 2 - vreplvei.w VX3, res1, 3 - vfadd.s res1, VX1, res1 - vfadd.s res1, VX2, res1 - vfadd.s res1, VX3, res1 - .align 3 - -.L23: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L24: - fld.s $f12, X, 0 * SIZE - fadd.s $f16, $f12, $f16 - addi.d I, I, -1 - add.d X, X, INCX - blt $r0, I, .L24 - .align 3 - -.L999: - fmov.s $f0, $f16 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/sswap_lasx.S b/kernel/loongarch64/sswap_lasx.S deleted file mode 100644 index 7184eff45..000000000 --- a/kernel/loongarch64/sswap_lasx.S +++ /dev/null @@ -1,286 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r7 -#define INCX $r8 -#define Y $r9 -#define INCY $r10 - -#define I $r17 -#define TEMP $r18 -#define XX $r5 -#define YY $r6 -#define t1 $r14 -#define t2 $r15 -#define t3 $r16 -#define t4 $r19 -#define a1 $f12 -#define a2 $f13 -#define a3 $f14 -#define a4 $f15 -#define b1 $f16 -#define b2 $f17 -#define b3 $f18 -#define b4 $f19 -#define VX0 $xr12 -#define VX1 $xr13 -#define VX2 $xr14 -#define VX3 $xr15 - - - PROLOGUE - bge $r0, N, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L112 - .align 3 - -.L111: - xvld VX0, X, 0 * SIZE - xvld VX2, Y, 0 * SIZE - addi.d I, I, -1 - xvst VX2, X, 0 * SIZE - xvst VX0, Y, 0 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L111 - .align 3 - -.L112: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L113: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - fst.s $f14, X, 0 * SIZE - addi.d X, X, SIZE - addi.d Y, Y, SIZE - blt $r0, I, .L113 - b .L999 - .align 3 - -.L12: // INCX==1 and INCY!=1 - bge $r0, I, .L122 - .align 3 - -.L121: - xvld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - xvstelm.w VX0, Y, 0, 0 - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - xvstelm.w VX0, Y, 0, 1 - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - xvstelm.w VX0, Y, 0, 2 - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvstelm.w VX0, Y, 0, 3 - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - ld.w t1, Y, 0 * SIZE - xvstelm.w VX0, Y, 0, 4 - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - xvstelm.w VX0, Y, 0, 5 - add.d Y, Y, 
INCY - ld.w t3, Y, 0 * SIZE - xvstelm.w VX0, Y, 0, 6 - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvstelm.w VX0, Y, 0, 7 - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvst VX2, X, 0 * SIZE - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - .align 3 - -.L122: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L123: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - fst.s $f14, X, 0 * SIZE - addi.d X, X, SIZE - add.d Y, Y, INCY - blt $r0, I, .L123 - b .L999 - .align 3 - -.L21: - bge $r0, I, .L212 - .align 3 - -.L211: - xvld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - xvstelm.w VX2, X, 0, 0 - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - xvstelm.w VX2, X, 0, 1 - add.d X, X, INCY - ld.w t3, X, 0 * SIZE - xvstelm.w VX2, X, 0, 2 - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - xvstelm.w VX2, X, 0, 3 - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - add.d X, X, INCX - ld.w t1, X, 0 * SIZE - xvstelm.w VX2, X, 0, 4 - add.d X, X, INCY - ld.w t2, X, 0 * SIZE - xvstelm.w VX2, X, 0, 5 - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - xvstelm.w VX2, X, 0, 6 - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - xvstelm.w VX2, X, 0, 7 - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - add.d X, X, INCX - xvst VX1, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - .align 3 - -.L212: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L213: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - fst.s $f14, X, 0 * SIZE - add.d X, X, INCX - addi.d Y, Y, SIZE - blt $r0, I, .L213 - b .L999 - .align 3 - -.L22: - bge $r0, I, .L223 - .align 3 - move XX, X - -.L222: - fld.s a1, X, 0 * SIZE - add.d X, X, INCX - fld.s a2, X, 0 * SIZE - add.d X, X, INCX - fld.s a3, X, 0 * SIZE - add.d X, X, INCX - fld.s a4, X, 0 * SIZE - add.d X, X, INCX - fld.s b1, Y, 0 * SIZE - fst.s a1, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s b2, Y, 0 * SIZE - fst.s a2, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s b3, Y, 0 * SIZE - fst.s a3, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s b4, Y, 0 * SIZE - fst.s a4, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s a1, X, 0 * SIZE - add.d X, X, INCX - fst.s b1, XX, 0 * SIZE - add.d XX, XX, INCX - fld.s b1, Y, 0 * SIZE - fst.s a1, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s a2, X, 0 * SIZE - add.d X, X, INCX - fst.s b2, XX, 0 * SIZE - add.d XX, XX, INCX - fld.s b2, Y, 0 * SIZE - fst.s a2, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s a3, X, 0 * SIZE - add.d X, X, INCX - fst.s b3, XX, 0 * SIZE - add.d XX, XX, INCX - fld.s b3, Y, 0 * SIZE - fst.s a3, Y, 0 * SIZE - fld.s a4, X, 0 * SIZE - add.d X, X, INCX - fst.s b4, XX, 0 * SIZE - add.d XX, XX, INCX - fld.s b4, Y, 0 * SIZE - fst.s a4, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s b1, XX, 0 * SIZE - add.d XX, XX, INCX - fst.s b2, XX, 0 * SIZE - add.d XX, XX, INCX - fst.s b3, XX, 0 * SIZE - add.d XX, XX, INCX - fst.s b4, XX, 0 * SIZE - add.d XX, XX, INCX - addi.d I, I, -1 - blt $r0, I, .L222 - .align 3 - -.L223: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L224: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - fst.s $f14, X, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L224 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/sswap_lsx.S b/kernel/loongarch64/sswap_lsx.S deleted file 
mode 100644 index 4f19a8024..000000000 --- a/kernel/loongarch64/sswap_lsx.S +++ /dev/null @@ -1,294 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r7 -#define INCX $r8 -#define Y $r9 -#define INCY $r10 - -#define I $r17 -#define TEMP $r18 -#define XX $r5 -#define YY $r6 -#define t1 $r14 -#define t2 $r15 -#define t3 $r16 -#define t4 $r19 -#define a1 $f12 -#define a2 $f13 -#define a3 $f14 -#define a4 $f15 -#define b1 $f16 -#define b2 $f17 -#define b3 $f18 -#define b4 $f19 -#define VX0 $vr12 -#define VX1 $vr13 -#define VX2 $vr14 -#define VX3 $vr15 - - - PROLOGUE - bge $r0, N, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L112 - .align 3 - -.L111: - vld VX0, X, 0 * SIZE - vld VX1, X, 4 * SIZE - vld VX2, Y, 0 * SIZE - vld VX3, Y, 4 * SIZE - addi.d I, I, -1 - vst VX2, X, 0 * SIZE - vst VX3, X, 4 * SIZE - vst VX0, Y, 0 * SIZE - vst VX1, Y, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L111 - .align 3 - -.L112: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L113: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - fst.s $f14, X, 0 * SIZE - addi.d X, X, SIZE - addi.d Y, Y, SIZE - blt $r0, I, .L113 - b .L999 - .align 3 - -.L12: // INCX==1 and INCY!=1 - bge $r0, I, .L122 - .align 3 - -.L121: - vld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - vstelm.w VX0, Y, 0, 0 - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - vstelm.w VX0, Y, 0, 1 - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - vstelm.w VX0, Y, 0, 2 - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vstelm.w VX0, Y, 0, 3 - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - vst VX2, X, 0 * SIZE - vld VX1, X, 4 * SIZE - ld.w t1, Y, 0 * SIZE - vstelm.w VX1, Y, 0, 0 - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - vstelm.w VX1, Y, 0, 1 - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - vstelm.w VX1, Y, 0, 2 - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vstelm.w VX1, Y, 0, 3 - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - add.d Y, Y, INCY - vst VX3, X, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - .align 3 - -.L122: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L123: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - fst.s $f14, X, 0 * SIZE - addi.d X, X, SIZE - add.d Y, Y, INCY - blt $r0, I, .L123 - b .L999 - .align 3 - -.L21:// INCX!=1 and INCY==1 - bge $r0, I, .L212 - .align 3 - -.L211: - vld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - vstelm.w VX2, X, 0, 0 - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - vstelm.w VX2, X, 0, 1 - add.d X, X, INCY - ld.w t3, X, 0 * SIZE - vstelm.w VX2, X, 0, 2 - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vstelm.w VX2, X, 0, 3 - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - add.d X, X, INCX - vst VX0, Y, 0 * SIZE - vld VX3, Y, 4 * SIZE - ld.w t1, X, 0 * SIZE - vstelm.w VX3, X, 0, 0 - add.d X, X, INCY - ld.w t2, X, 0 * SIZE - vstelm.w VX3, X, 0, 1 - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - vstelm.w VX3, X, 0, 2 - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vstelm.w VX3, X, 0, 3 - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w 
VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - add.d X, X, INCX - vst VX1, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - .align 3 - -.L212: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L213: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - fst.s $f14, X, 0 * SIZE - add.d X, X, INCX - addi.d Y, Y, SIZE - blt $r0, I, .L213 - b .L999 - .align 3 - -.L22: - bge $r0, I, .L223 - .align 3 - move XX, X - -.L222: - fld.s a1, X, 0 * SIZE - add.d X, X, INCX - fld.s a2, X, 0 * SIZE - add.d X, X, INCX - fld.s a3, X, 0 * SIZE - add.d X, X, INCX - fld.s a4, X, 0 * SIZE - add.d X, X, INCX - fld.s b1, Y, 0 * SIZE - fst.s a1, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s b2, Y, 0 * SIZE - fst.s a2, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s b3, Y, 0 * SIZE - fst.s a3, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s b4, Y, 0 * SIZE - fst.s a4, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s a1, X, 0 * SIZE - add.d X, X, INCX - fst.s b1, XX, 0 * SIZE - add.d XX, XX, INCX - fld.s b1, Y, 0 * SIZE - fst.s a1, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s a2, X, 0 * SIZE - add.d X, X, INCX - fst.s b2, XX, 0 * SIZE - add.d XX, XX, INCX - fld.s b2, Y, 0 * SIZE - fst.s a2, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s a3, X, 0 * SIZE - add.d X, X, INCX - fst.s b3, XX, 0 * SIZE - add.d XX, XX, INCX - fld.s b3, Y, 0 * SIZE - fst.s a3, Y, 0 * SIZE - fld.s a4, X, 0 * SIZE - add.d X, X, INCX - fst.s b4, XX, 0 * SIZE - add.d XX, XX, INCX - fld.s b4, Y, 0 * SIZE - fst.s a4, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s b1, XX, 0 * SIZE - add.d XX, XX, INCX - fst.s b2, XX, 0 * SIZE - add.d XX, XX, INCX - fst.s b3, XX, 0 * SIZE - add.d XX, XX, INCX - fst.s b4, XX, 0 * SIZE - add.d XX, XX, INCX - addi.d I, I, -1 - blt $r0, I, .L222 - .align 3 - -.L223: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L224: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - fst.s $f14, X, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L224 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/sum_lasx.S b/kernel/loongarch64/sum_lasx.S new file mode 100644 index 000000000..fd6d5adb3 --- /dev/null +++ b/kernel/loongarch64/sum_lasx.S @@ -0,0 +1,225 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 +#define res1 $xr16 +#define res2 $xr17 + PROLOGUE + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: + xvld VX0, X, 0 + xvfadd.s res1, res1, VX0 +#ifdef DOUBLE + xvld VX1, X, 32 + xvfadd.s res1, res1, VX1 +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD $f12, X, 0 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvfadd.s res1, VX0, res1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 
+ xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f12, X, 0 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/sum_lsx.S b/kernel/loongarch64/sum_lsx.S new file mode 100644 index 000000000..6b2027781 --- /dev/null +++ b/kernel/loongarch64/sum_lsx.S @@ -0,0 +1,204 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 +#define res1 $vr16 +#define res2 $vr17 + PROLOGUE + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: + vld VX0, X, 0 + vld VX1, X, 16 + VFADD res2, VX0, VX1 + VFADD res1, res1, res2 +#ifdef DOUBLE + vld VX0, X, 32 + vld VX1, X, 48 + VFADD res2, VX0, VX1 + VFADD res1, res1, res2 +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, res1, VX1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD $f12, X, 0 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfadd.s res2, VX0, VX1 + vfadd.s res1, res1, res2 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f12, X, 0 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/swap_lasx.S b/kernel/loongarch64/swap_lasx.S new file mode 100644 index 000000000..4767fffe3 --- /dev/null +++ b/kernel/loongarch64/swap_lasx.S @@ -0,0 +1,401 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define YY $r6 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 + + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +/* INCX==1 and INCY==1 */ +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + xvld VX0, X, 0 + xvld VX2, Y, 0 + addi.d I, I, -1 + xvst VX2, X, 0 + xvst VX0, Y, 0 +#ifdef DOUBLE + xvld VX0, X, 32 + xvld VX2, Y, 32 + xvst VX2, X, 32 + xvst VX0, Y, 32 +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + LD $f12, X, 0 + LD $f14, Y, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + ST $f14, X, 0 + addi.d X, X, SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +/* INCX==1 and INCY!=1 */ +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + xvld VX0, X, 0 + ld.d t1, Y, 0 + xvstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + ld.d t2, Y, 0 + xvstelm.d VX0, Y, 0, 1 + add.d Y, Y, INCY + ld.d t3, Y, 0 + xvstelm.d VX0, Y, 0, 2 + add.d Y, Y, INCY + ld.d t4, Y, 0 + xvstelm.d VX0, Y, 0, 3 + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvst VX2, X, 0 + xvld VX1, X, 4 * SIZE + ld.d t1, Y, 0 + xvstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + ld.d t2, Y, 0 + xvstelm.d VX1, Y, 0, 1 + add.d Y, Y, INCY + ld.d t3, Y, 0 + 
xvstelm.d VX1, Y, 0, 2 + add.d Y, Y, INCY + ld.d t4, Y, 0 + xvstelm.d VX1, Y, 0, 3 + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvst VX3, X, 4 * SIZE + addi.d X, X, 8 * SIZE +#else + xvld VX0, X, 0 + ld.w t1, Y, 0 + xvstelm.w VX0, Y, 0, 0 + add.d Y, Y, INCY + ld.w t2, Y, 0 + xvstelm.w VX0, Y, 0, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 + xvstelm.w VX0, Y, 0, 2 + add.d Y, Y, INCY + ld.w t4, Y, 0 + xvstelm.w VX0, Y, 0, 3 + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + ld.w t1, Y, 0 + xvstelm.w VX0, Y, 0, 4 + add.d Y, Y, INCY + ld.w t2, Y, 0 + xvstelm.w VX0, Y, 0, 5 + add.d Y, Y, INCY + ld.w t3, Y, 0 + xvstelm.w VX0, Y, 0, 6 + add.d Y, Y, INCY + ld.w t4, Y, 0 + xvstelm.w VX0, Y, 0, 7 + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvst VX2, X, 0 + addi.d X, X, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD $f12, X, 0 + LD $f14, Y, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + ST $f14, X, 0 + addi.d X, X, SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + xvld VX2, Y, 0 + ld.d t1, X, 0 + xvstelm.d VX2, X, 0, 0 + add.d X, X, INCX + ld.d t2, X, 0 + xvstelm.d VX2, X, 0, 1 + add.d X, X, INCX + ld.d t3, X, 0 + xvstelm.d VX2, X, 0, 2 + add.d X, X, INCX + ld.d t4, X, 0 + xvstelm.d VX2, X, 0, 3 + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvst VX0, Y, 0 + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 + xvstelm.d VX3, X, 0, 0 + add.d X, X, INCX + ld.d t2, X, 0 + xvstelm.d VX3, X, 0, 1 + add.d X, X, INCX + ld.d t3, X, 0 + xvstelm.d VX3, X, 0, 2 + add.d X, X, INCX + ld.d t4, X, 0 + xvstelm.d VX3, X, 0, 3 + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvst VX1, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvld VX2, Y, 0 + ld.w t1, X, 0 + xvstelm.w VX2, X, 0, 0 + add.d X, X, INCX + ld.w t2, X, 0 + xvstelm.w VX2, X, 0, 1 + add.d X, X, INCX + ld.w t3, X, 0 + xvstelm.w VX2, X, 0, 2 + add.d X, X, INCX + ld.w t4, X, 0 + xvstelm.w VX2, X, 0, 3 + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 + xvstelm.w VX2, X, 0, 4 + add.d X, X, INCX + ld.w t2, X, 0 + xvstelm.w VX2, X, 0, 5 + add.d X, X, INCX + ld.w t3, X, 0 + xvstelm.w VX2, X, 0, 6 + add.d X, X, INCX + ld.w t4, X, 0 + xvstelm.w VX2, X, 0, 7 + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvst VX0, Y, 0 + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD $f12, X, 0 + LD $f14, Y, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + ST $f14, X, 0 + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + move XX, X + +.L222: + LD a1, X, 0 + add.d X, X, INCX + LD a2, X, 0 + add.d X, X, INCX + LD a3, X, 0 + add.d X, X, INCX + LD a4, X, 0 + add.d X, X, INCX + LD b1, Y, 0 + ST a1, Y, 0 + add.d Y, Y, INCY + LD b2, Y, 0 + ST a2, Y, 0 + add.d Y, Y, INCY + LD b3, Y, 0 + ST a3, Y, 0 + add.d Y, Y, INCY + LD b4, Y, 0 + ST a4, Y, 0 + 
add.d Y, Y, INCY + LD a1, X, 0 + add.d X, X, INCX + ST b1, XX, 0 + add.d XX, XX, INCX + LD b1, Y, 0 + ST a1, Y, 0 + add.d Y, Y, INCY + LD a2, X, 0 + add.d X, X, INCX + ST b2, XX, 0 + add.d XX, XX, INCX + LD b2, Y, 0 + ST a2, Y, 0 + add.d Y, Y, INCY + LD a3, X, 0 + add.d X, X, INCX + ST b3, XX, 0 + add.d XX, XX, INCX + LD b3, Y, 0 + ST a3, Y, 0 + LD a4, X, 0 + add.d X, X, INCX + ST b4, XX, 0 + add.d XX, XX, INCX + LD b4, Y, 0 + ST a4, Y, 0 + add.d Y, Y, INCY + ST b1, XX, 0 + add.d XX, XX, INCX + ST b2, XX, 0 + add.d XX, XX, INCX + ST b3, XX, 0 + add.d XX, XX, INCX + ST b4, XX, 0 + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + LD $f12, X, 0 + LD $f14, Y, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + ST $f14, X, 0 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/swap_lsx.S b/kernel/loongarch64/swap_lsx.S new file mode 100644 index 000000000..736187f93 --- /dev/null +++ b/kernel/loongarch64/swap_lsx.S @@ -0,0 +1,431 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define YY $r6 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 + + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +/* INCX==1 and incy==1 */ +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + vld VX0, X, 0 + vld VX1, X, 16 + vld VX2, Y, 0 + vld VX3, Y, 16 + addi.d I, I, -1 + vst VX2, X, 0 + vst VX3, X, 16 + vst VX0, Y, 0 + vst VX1, Y, 16 +#ifdef DOUBLE + vld VX0, X, 32 + vld VX1, X, 48 + vld VX2, Y, 32 + vld VX3, Y, 48 + vst VX2, X, 32 + vst VX3, X, 48 + vst VX0, Y, 32 + vst VX1, Y, 48 +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: +#ifdef DOUBLE + fld.d $f12, X, 0 + fld.d $f14, Y, 0 + addi.d I, I, -1 + fst.d $f12, Y, 0 + fst.d $f14, X, 0 +#else + fld.s $f12, X, 0 + fld.s $f14, Y, 0 + addi.d I, I, -1 + fst.s $f12, Y, 0 + fst.s $f14, X, 0 +#endif + addi.d X, X, SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +/* INCX==1 and INCY!=1 */ +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + vld VX0, X, 0 + ld.d t1, Y, 0 + vstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + ld.d t2, Y, 0 + vstelm.d VX0, Y, 0, 1 + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vst VX2, X, 0 + vld VX1, X, 2 * SIZE + ld.d t3, Y, 0 + vstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + ld.d t4, Y, 0 + vstelm.d VX1, Y, 0, 1 + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vst VX3, X, 2 * SIZE + vld VX0, X, 4 * SIZE + ld.d t1, Y, 0 + vstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + ld.d t2, Y, 0 + vstelm.d VX0, Y, 0, 1 + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vst VX2, X, 4 * SIZE + vld VX1, X, 6 * SIZE + ld.d t3, Y, 0 + vstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + ld.d t4, Y, 0 + vstelm.d VX1, Y, 0, 1 + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vst VX3, X, 6 * SIZE + addi.d X, X, 8 * SIZE +#else + vld VX0, X, 0 + ld.w t1, Y, 0 + vstelm.w VX0, Y, 0, 0 + add.d Y, Y, INCY + ld.w t2, Y, 0 + vstelm.w VX0, Y, 0, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 + vstelm.w VX0, Y, 0, 2 + add.d Y, Y, INCY + ld.w t4, Y, 0 + vstelm.w VX0, Y, 0, 3 + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vst VX2, X, 0 + + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 + vstelm.w VX1, Y, 0, 0 + add.d Y, Y, INCY + ld.w t2, Y, 0 + vstelm.w VX1, Y, 0, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 + vstelm.w VX1, Y, 0, 2 + add.d Y, Y, INCY + ld.w t4, Y, 0 + vstelm.w VX1, Y, 0, 3 + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vst VX3, X, 4 * SIZE + addi.d X, X, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 
7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD $f12, X, 0 + LD $f14, Y, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + ST $f14, X, 0 + addi.d X, X, SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +/* INCX!=1 and INCY==1 */ +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + vld VX2, Y, 0 + ld.d t1, X, 0 + vstelm.d VX2, X, 0, 0 + add.d X, X, INCX + ld.d t2, X, 0 + vstelm.d VX2, X, 0, 1 + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vst VX0, Y, 0 + vld VX3, Y, 2 * SIZE + ld.d t3, X, 0 + vstelm.d VX3, X, 0, 0 + add.d X, X, INCX + ld.d t4, X, 0 + vstelm.d VX3, X, 0, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vst VX1, Y, 2 * SIZE + vld VX2, Y, 4 * SIZE + ld.d t1, X, 0 + vstelm.d VX2, X, 0, 0 + add.d X, X, INCX + ld.d t2, X, 0 + vstelm.d VX2, X, 0, 1 + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vst VX0, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t3, X, 0 + vstelm.d VX3, X, 0, 0 + add.d X, X, INCX + ld.d t4, X, 0 + vstelm.d VX3, X, 0, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vst VX1, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE +#else + vld VX2, Y, 0 + ld.w t1, X, 0 + vstelm.w VX2, X, 0, 0 + add.d X, X, INCX + ld.w t2, X, 0 + vstelm.w VX2, X, 0, 1 + add.d X, X, INCX + ld.w t3, X, 0 + vstelm.w VX2, X, 0, 2 + add.d X, X, INCX + ld.w t4, X, 0 + vstelm.w VX2, X, 0, 3 + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vst VX0, Y, 0 + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 + vstelm.w VX3, X, 0, 0 + add.d X, X, INCX + ld.w t2, X, 0 + vstelm.w VX3, X, 0, 1 + add.d X, X, INCX + ld.w t3, X, 0 + vstelm.w VX3, X, 0, 2 + add.d X, X, INCX + ld.w t4, X, 0 + vstelm.w VX3, X, 0, 3 + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + vst VX1, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE + addi.d I, I, -1 + ST $f12, Y, 0 * SIZE + ST $f14, X, 0 * SIZE + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + move XX, X + +.L222: + LD a1, X, 0 + add.d X, X, INCX + LD a2, X, 0 + add.d X, X, INCX + LD a3, X, 0 + add.d X, X, INCX + LD a4, X, 0 + add.d X, X, INCX + LD b1, Y, 0 + ST a1, Y, 0 + add.d Y, Y, INCY + LD b2, Y, 0 + ST a2, Y, 0 + add.d Y, Y, INCY + LD b3, Y, 0 + ST a3, Y, 0 + add.d Y, Y, INCY + LD b4, Y, 0 + ST a4, Y, 0 + add.d Y, Y, INCY + LD a1, X, 0 + add.d X, X, INCX + ST b1, XX, 0 + add.d XX, XX, INCX + LD b1, Y, 0 + ST a1, Y, 0 + add.d Y, Y, INCY + LD a2, X, 0 + add.d X, X, INCX + ST b2, XX, 0 + add.d XX, XX, INCX + LD b2, Y, 0 + ST a2, Y, 0 + add.d Y, Y, INCY + LD a3, X, 0 + add.d X, X, INCX + ST b3, XX, 0 + add.d XX, XX, INCX + LD b3, Y, 0 + ST a3, Y, 0 + LD a4, X, 0 + add.d X, X, INCX + ST b4, XX, 0 + add.d XX, XX, INCX + LD b4, Y, 0 + ST a4, Y, 0 + add.d Y, Y, INCY + ST b1, XX, 0 + add.d XX, XX, INCX + ST b2, XX, 0 + add.d XX, XX, INCX + ST b3, XX, 0 + add.d XX, XX, INCX + ST b4, XX, 0 + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + LD $f12, X, 0 + LD $f14, Y, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + ST $f14, X, 0 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 
+ + EPILOGUE From 154baad454647fdd6d71e2c907285859718da22e Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Wed, 27 Dec 2023 16:04:33 +0800 Subject: [PATCH 03/21] loongarch64: Refine iamin optimization. --- common_loongarch64.h | 10 + kernel/loongarch64/KERNEL.LOONGSON2K1000 | 4 +- kernel/loongarch64/KERNEL.LOONGSON3R5 | 4 +- .../{isamin_lasx.S => iamin_lasx.S} | 270 +++++++---- kernel/loongarch64/iamin_lsx.S | 446 ++++++++++++++++++ kernel/loongarch64/idamin_lasx.S | 275 ----------- kernel/loongarch64/idamin_lsx.S | 228 --------- kernel/loongarch64/isamin_lsx.S | 275 ----------- 8 files changed, 649 insertions(+), 863 deletions(-) rename kernel/loongarch64/{isamin_lasx.S => iamin_lasx.S} (54%) create mode 100644 kernel/loongarch64/iamin_lsx.S delete mode 100644 kernel/loongarch64/idamin_lasx.S delete mode 100644 kernel/loongarch64/idamin_lsx.S delete mode 100644 kernel/loongarch64/isamin_lsx.S diff --git a/common_loongarch64.h b/common_loongarch64.h index 72e900f77..846fc0dbd 100644 --- a/common_loongarch64.h +++ b/common_loongarch64.h @@ -119,6 +119,7 @@ static inline int WhereAmI(void){ #define MOV fmov.d #define CMOVT fsel #define MTC movgr2fr.d +#define MTG movfr2gr.d #define FABS fabs.d #define FMIN fmin.d #define FMINA fmina.d @@ -136,6 +137,8 @@ static inline int WhereAmI(void){ #define XVFMINA xvfmina.d #define XVFMAX xvfmax.d #define XVFMAXA xvfmaxa.d +#define XVCMPEQ xvfcmp.ceq.d +#define XVCMPLT xvfcmp.clt.d #define VFSUB vfsub.d #define VFADD vfadd.d @@ -144,6 +147,8 @@ static inline int WhereAmI(void){ #define VFMINA vfmina.d #define VFMAX vfmax.d #define VFMAXA vfmaxa.d +#define VCMPEQ vfcmp.ceq.d +#define VCMPLT vfcmp.clt.d #else @@ -159,6 +164,7 @@ static inline int WhereAmI(void){ #define MOV fmov.s #define CMOVT fsel #define MTC movgr2fr.w +#define MTG movfr2gr.s #define FABS fabs.s #define FMIN fmin.s #define FMINA fmina.s @@ -176,6 +182,8 @@ static inline int WhereAmI(void){ #define XVFMINA xvfmina.s #define XVFMAX xvfmax.s #define XVFMAXA xvfmaxa.s +#define XVCMPEQ xvfcmp.ceq.s +#define XVCMPLT xvfcmp.clt.s #define VFSUB vfsub.s #define VFADD vfadd.s @@ -184,6 +192,8 @@ static inline int WhereAmI(void){ #define VFMINA vfmina.s #define VFMAX vfmax.s #define VFMAXA vfmaxa.s +#define VCMPEQ vfcmp.ceq.s +#define VCMPLT vfcmp.clt.s #endif /* defined(DOUBLE) */ diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index cb230b348..4eae2e4f9 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -28,8 +28,8 @@ IDMINKERNEL = idmin_lsx.S ISAMAXKERNEL = isamax_lsx.S IDAMAXKERNEL = idamax_lsx.S -ISAMINKERNEL = isamin_lsx.S -IDAMINKERNEL = idamin_lsx.S +ISAMINKERNEL = iamin_lsx.S +IDAMINKERNEL = iamin_lsx.S SCOPYKERNEL = copy_lsx.S DCOPYKERNEL = copy_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index ba59c4566..e7e1b5d5a 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -28,8 +28,8 @@ IDMINKERNEL = idmin_lasx.S ISAMAXKERNEL = isamax_lasx.S IDAMAXKERNEL = idamax_lasx.S -ISAMINKERNEL = isamin_lasx.S -IDAMINKERNEL = idamin_lasx.S +ISAMINKERNEL = iamin_lasx.S +IDAMINKERNEL = iamin_lasx.S SCOPYKERNEL = copy_lasx.S DCOPYKERNEL = copy_lasx.S diff --git a/kernel/loongarch64/isamin_lasx.S b/kernel/loongarch64/iamin_lasx.S similarity index 54% rename from kernel/loongarch64/isamin_lasx.S rename to kernel/loongarch64/iamin_lasx.S index cbdf32530..6ea117907 100644 --- a/kernel/loongarch64/isamin_lasx.S +++ 
b/kernel/loongarch64/iamin_lasx.S @@ -1,3 +1,30 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + #define ASSEMBLER #include "common.h" @@ -39,6 +66,31 @@ slli.d INCX, INCX, BASE_SHIFT bne INCX, TEMP, .L20 xvld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else addi.w i0, i0, 1 srai.d I, N, 3 bge $r0, I, .L21 @@ -76,25 +128,49 @@ xvinsgr2vr.w VI0, i0, 6 //7 addi.w i0, i0, 1 xvinsgr2vr.w VI0, i0, 7 //8 +#endif .align 3 .L10: xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvadd.d VI1, VI1, VINC8 + xvld VX1, X, 4 * SIZE + xvadd.d VI2, VI1, VINC4 + xvfmina.d VM1, VX0, VX1 + xvfcmp.ceq.d VT0, VX0, VM1 addi.d I, I, -1 - xvadd.w VI1, VI1, VINC8 + xvbitsel.v VI2, VI2, VI1, VT0 + xvfmina.d VM1, VM0, VM1 +#else + addi.d I, I, -1 + xvadd.w VI2, VI1, VINC8 xvfmina.s VM1, VX0, VM0 - xvfcmp.ceq.s VT0, VM0, VM1 +#endif + XVCMPEQ VT0, VM0, VM1 addi.d X, X, 8 * SIZE xvbitsel.v VM0, VM1, VM0, VT0 - xvbitsel.v VI0, VI1, VI0, VT0 + xvbitsel.v VI0, VI2, VI0, VT0 blt $r0, I, .L10 .align 3 .L15: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmina.d VM1, x1, x2 + xvfcmp.ceq.d VT0, x1, VM1 +#else xvxor.v VX0, VX0, VX0 - xvor.v VX0, VI0, VX0 + xvor.v VX0, VI0, VX0 xvxor.v VX1, VX1, VX1 - xvor.v VX1, VM0, VX1 + xvor.v 
VX1, VM0, VX1 xvpickve.w VI1, VI0, 0 xvpickve.w VI2, VI0, 1 xvpickve.w VI3, VI0, 2 @@ -105,28 +181,62 @@ xvpickve.w x4, VM0, 3 xvfmina.s VM1, x1, x2 xvfcmp.ceq.s VT0, x1, VM1 +#endif xvbitsel.v VINC4, VI2, VI1, VT0 - xvfmina.s VM0, x3, x4 - xvfcmp.ceq.s VT0, x3, VM0 + XVFMINA VM0, x4, x3 + XVCMPEQ VT0, x3, VM0 xvbitsel.v VINC8, VI4, VI3, VT0 - xvfmina.s VM0, VM0, VM1 - xvfcmp.ceq.s VT0, VM0, VM1 + XVFMINA VM0, VM0, VM1 + XVCMPEQ VT0, VM0, VM1 xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - xvfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f9 bceqz $fcc0, .L26 - xvfcmp.clt.s VT0, VI1, VI0 + XVCMPLT VT0, VI1, VI0 xvbitsel.v VI0, VI0, VI1, VT0 b .L26 .align 3 .L20: // INCX!=1 move TEMP, X - addi.w i0, i0, 1 - ld.w t1, TEMP, 0 * SIZE +#ifdef DOUBLE + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t2, 1 + xvinsgr2vr.d VM0, t3, 2 + xvinsgr2vr.d VM0, t4, 3 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE add.d TEMP, TEMP, INCX xvinsgr2vr.w VM0, t1, 0 srai.d I, N, 3 @@ -186,9 +296,43 @@ xvinsgr2vr.w VI0, i0, 6 //7 addi.w i0, i0, 1 xvinsgr2vr.w VI0, i0, 7 //8 +#endif .align 3 .L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + xvadd.d VI1, VI1, VINC8 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvadd.d VI2, VI1, VINC4 + xvfmina.d VM1, VX0, VX1 + xvfcmp.ceq.d VT0, VX0, VM1 + xvbitsel.v VI2, VI2, VI1, VT0 + xvfmina.d VM1, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 +#else ld.w t1, X, 0 * SIZE add.d X, X, INCX ld.w t2, X, 0 * SIZE @@ -213,73 +357,42 @@ xvinsgr2vr.w VX0, t2, 5 xvinsgr2vr.w VX0, t3, 6 xvinsgr2vr.w VX0, t4, 7 - xvadd.w VI1, VI1, VINC8 + xvadd.w VI2, VI1, VINC8 xvfmina.s VM1, VX0, VM0 xvfcmp.ceq.s VT0, VM1, VM0 +#endif addi.d I, I, -1 xvbitsel.v VM0, VM1, VM0, VT0 - xvbitsel.v VI0, VI1, VI0, VT0 + xvbitsel.v VI0, VI2, VI0, VT0 blt $r0, I, .L24 .align 3 -.L25: - xvxor.v VX0, VX0, VX0 - xvor.v VX0, VI0, VX0 - xvxor.v VX1, VX1, VX1 - xvor.v VX1, VM0, VX1 - xvpickve.w VI1, VI0, 0 - xvpickve.w VI2, VI0, 1 - xvpickve.w VI3, VI0, 2 - xvpickve.w VI4, VI0, 3 - xvpickve.w x1, VM0, 0 - xvpickve.w x2, VM0, 1 - xvpickve.w x3, VM0, 2 - xvpickve.w x4, VM0, 3 - xvfmina.s VM1, x1, x2 - xvfcmp.ceq.s VT0, x1, VM1 - xvbitsel.v VINC4, VI2, VI1, VT0 - xvfmina.s VM0, x3, x4 - xvfcmp.ceq.s 
VT0, x3, VM0 - xvbitsel.v VINC8, VI3, VI4, VT0 - xvfmina.s VM0, VM0, VM1 - xvfcmp.ceq.s VT0, VM0, VM1 - xvbitsel.v VM0, VM0, VM1, VT0 - xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - xvfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - xvfcmp.clt.s VT0, VI1, VI0 - xvbitsel.v VI0, VI0, VI1, VT0 - .align 3 - .L26: - xvfcmp.ceq.s VT0, VM0, x2 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f10 bceqz $fcc0, .L27 - xvfcmp.clt.s VT0, VI2, VI0 + XVCMPLT VT0, VI2, VI0 xvbitsel.v VI0, VI0, VI2, VT0 .align 3 .L27: - xvfcmp.ceq.s VT0, VM0, x3 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f11 bceqz $fcc0, .L28 - xvfcmp.clt.s VT0, VI3, VI0 + XVCMPLT VT0, VI3, VI0 xvbitsel.v VI0, VI0, VI3, VT0 .align 3 .L28: - xvfcmp.ceq.s VT0, VM0, x4 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f12 bceqz $fcc0, .L29 - xvfcmp.clt.s VT0, VI4, VI0 + XVCMPLT VT0, VI4, VI0 xvbitsel.v VI0, VI0, VI4, VT0 .align 3 .L29: +#ifdef DOUBLE + movfr2gr.d i0, $f20 + .align 3 +#else fmov.s $f16, $f20 .align 3 @@ -306,35 +419,28 @@ xvfmina.s VM0, VM0, VM1 xvfcmp.ceq.s VT0, VM0, VM1 xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - xvfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f9 bceqz $fcc0, .L262 xvfcmp.clt.s VT0, VI1, VI0 xvbitsel.v VI0, VI0, VI1, VT0 .align 3 .L262: - xvfcmp.ceq.s VT0, VM0, x2 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f10 bceqz $fcc0, .L272 xvfcmp.clt.s VT0, VI2, VI0 xvbitsel.v VI0, VI0, VI2, VT0 .align 3 .L272: - xvfcmp.ceq.s VT0, VM0, x3 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f11 bceqz $fcc0, .L282 xvfcmp.clt.s VT0, VI3, VI0 xvbitsel.v VI0, VI0, VI3, VT0 .align 3 .L282: - xvfcmp.ceq.s VT0, VM0, x4 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f12 bceqz $fcc0, .L292 xvfcmp.clt.s VT0, VI4, VI0 xvbitsel.v VI0, VI0, VI4, VT0 @@ -346,9 +452,11 @@ xvbitsel.v VI0, VI0, VI1, VT0 movfr2gr.s i0, $f20 -.L21: //N<8 - andi I, N, 7 - bge $r0, I, .L999 +#endif + +.L21: // N<8 + andi I, N, 7 + bge $r0, I, .L999 srai.d i1, N, 3 slli.d i1, i1, 3 addi.d i1, i1, 1 //current index @@ -357,17 +465,17 @@ .align 3 .L22: - fld.s $f9, X, 0 + LD $f9, X, 0 addi.d I, I, -1 - xvfmina.s VM1, x1, VM0 - xvfcmp.ceq.s VT0, VM0, VM1 - add.d X, X, INCX + XVFMINA VM1, x1, VM0 + XVCMPEQ VT0, VM0, VM1 + add.d X, X, INCX xvbitsel.v VM0, VM1, VM0, VT0 xvbitsel.v VI0, VI1, VI0, VT0 addi.d i1, i1, 1 movgr2fr.d $f21, i1 blt $r0, I, .L22 - movfr2gr.s i0, $f20 + MTG i0, $f20 .align 3 .L999: @@ -375,4 +483,4 @@ jirl $r0, $r1, 0x0 .align 3 - EPILOGUE \ No newline at end of file + EPILOGUE diff --git a/kernel/loongarch64/iamin_lsx.S b/kernel/loongarch64/iamin_lsx.S new file mode 100644 index 000000000..ce885fd88 --- /dev/null +++ b/kernel/loongarch64/iamin_lsx.S @@ -0,0 +1,446 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#ifdef DOUBLE +#define VINC2 $vr17 +#define VINC4 $vr18 +#else +#define VINC4 $vr17 +#define VINC8 $vr18 +#endif +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + vld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC2, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L10: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 2 * SIZE + vadd.d VI2, VI1, VINC2 + vfmina.d x1, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x1 + vbitsel.v x2, VI2, VI1, VT0 + vld VX0, X, 4 * SIZE + vadd.d VI1, VI2, VINC2 + vld VX1, X, 6 * SIZE + vadd.d VI2, VI1, VINC2 + vfmina.d x3, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x3 + vbitsel.v x4, VI2, VI1, VT0 + vfmina.d x3, x1, x3 + vfcmp.ceq.d VT0, x1, x3 + addi.d I, I, -1 + vbitsel.v x2, x4, x2, VT0 + vfmina.d VM1, VM0, x3 +#else + vadd.w VI1, 
VI1, VINC8 + vld VX1, X, 4 * SIZE + vadd.w VI2, VI1, VINC4 + vfmina.s VM1, VX0, VX1 + vfcmp.ceq.s VT0, VX0, VM1 + addi.d I, I, -1 + vbitsel.v x2, VI2, VI1, VT0 + vfmina.s VM1, VM0, VM1 +#endif + VCMPEQ VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, x2, VI0, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmina.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC4, VI2, VI1, VT0 + vfmina.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC8, VI4, VI3, VT0 + vfmina.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L26 +#endif + .align 3 + +.L20: // INCX!=1 + move TEMP, X +#ifdef DOUBLE + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + + vinsgr2vr.d VM0, t2, 1 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC2, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + + vreplvei.d VI1, VI0, 0 + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t2, 1 + vinsgr2vr.w VM0, t3, 2 + vinsgr2vr.w VM0, t4, 3 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI1, VINC4 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, VINC2 + vfmina.d x1, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x1 + vbitsel.v x2, VI2, VI1, VT0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI2, VINC2 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, VINC2 + vfmina.d x3, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x3 + vbitsel.v x4, VI2, VI1, VT0 + vfmina.d x3, x1, x3 + vfcmp.ceq.d VT0, x1, x3 + 
addi.d I, I, -1 + vbitsel.v x2, x4, x2, VT0 + vfmina.d VM1, VM0, x3 + vbitsel.v VM0, VM1, VM0, VT0 + vfcmp.ceq.d VT0, VM0, VM1 + vbitsel.v VI0, x2, VI0, VT0 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vadd.w VI1, VI1, VINC8 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vadd.w VI2, VI1, VINC4 + vfmina.s VM1, VX0, VX1 + vfcmp.ceq.s VT0, VX0, VM1 + vbitsel.v VI2, VI2, VI1, VT0 + vfmina.s VM1, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + addi.d I, I, -1 + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI2, VI0, VT0 +#endif + blt $r0, I, .L24 + .align 3 + +.L25: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmina.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC4, VI2, VI1, VT0 + vfmina.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC8, VI4, VI3, VT0 + vfmina.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 +#endif + .align 3 + +.L26: +#ifdef DOUBLE + vfmina.d VM0, x1, x2 + vfcmp.ceq.d VT0, x1, VM0 + vbitsel.v VI0, VI2, VI1, VT0 + .align 3 + +.L27: + movfr2gr.d i0, $f20 + +#else + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + vfcmp.clt.s VT0, VI2, VI0 + vbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + vfcmp.clt.s VT0, VI3, VI0 + vbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + vfcmp.clt.s VT0, VI4, VI0 + vbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: + movfr2gr.s i0, $f20 +#endif + .align 3 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + LD $f9, X, 0 + addi.d I, I, -1 + VFMINA VM1, x1, VM0 + VCMPEQ VT0, VM0, VM1 + add.d X, X, INCX + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + addi.d i1, i1, 1 + MTC $f21, i1 + blt $r0, I, .L22 + movfr2gr.s i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/idamin_lasx.S b/kernel/loongarch64/idamin_lasx.S deleted file mode 100644 index 6ef1e8903..000000000 --- a/kernel/loongarch64/idamin_lasx.S +++ /dev/null @@ -1,275 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define t1 $r13 -#define t2 $r15 -#define t3 $r18 -#define t4 $r16 -#define i0 $r17 -#define i1 $r14 -#define TEMP $r19 -#define x1 $xr9 -#define x2 $xr10 -#define x3 $xr11 -#define x4 $xr12 -#define VX0 $xr13 -#define VX1 $xr14 -#define VM0 $xr15 -#define VM1 $xr16 -#define VINC4 $xr17 -#define VINC8 $xr18 -#define 
VI0 $xr20 -#define VI1 $xr21 -#define VI2 $xr22 -#define VI3 $xr8 -#define VI4 $xr19 -#define VT0 $xr23 - - PROLOGUE - li.d i0, 0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - xvld VM0, X, 0 - addi.d i0, i0, 1 - srai.d I, N, 3 - bge $r0, I, .L21 - slli.d i0, i0, 2 //4 - xvreplgr2vr.d VINC4, i0 - slli.d i0, i0, 1 //8 - xvreplgr2vr.d VINC8, i0 - addi.d i0, i0, -15 - xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 3 - addi.d i0, i0, 5 - xvinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 1 //2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 2 //3 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 3 //4 - .align 3 - -.L10: - xvld VX0, X, 0 * SIZE - xvadd.d VI1, VI1, VINC8 - xvld VX1, X, 4 * SIZE - xvadd.d VI2, VI1, VINC4 - xvfmina.d VM1, VX0, VX1 - xvfcmp.ceq.d VT0, VX0, VM1 - addi.d I, I, -1 - xvbitsel.v VI2, VI2, VI1, VT0 - xvfmina.d VM1, VM0, VM1 - xvfcmp.ceq.d VT0, VM0, VM1 - addi.d X, X, 8 * SIZE - xvbitsel.v VM0, VM1, VM0, VT0 - xvbitsel.v VI0, VI2, VI0, VT0 - blt $r0, I, .L10 - .align 3 - -.L15: - xvpickve.d VI1, VI0, 0 - xvpickve.d VI2, VI0, 1 - xvpickve.d VI3, VI0, 2 - xvpickve.d VI4, VI0, 3 - xvpickve.d x1, VM0, 0 - xvpickve.d x2, VM0, 1 - xvpickve.d x3, VM0, 2 - xvpickve.d x4, VM0, 3 - xvfmina.d VM1, x1, x2 - xvfcmp.ceq.d VT0, x1, VM1 - xvbitsel.v VINC4, VI2, VI1, VT0 - xvfmina.d VM0, x4, x3 - xvfcmp.ceq.d VT0, x3, VM0 - xvbitsel.v VINC8, VI4, VI3, VT0 - xvfmina.d VM0, VM0, VM1 - xvfcmp.ceq.d VT0, VM0, VM1 - xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - xvfcmp.ceq.d VT0, VM0, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - xvfcmp.clt.d VT0, VI1, VI0 - xvbitsel.v VI0, VI0, VI1, VT0 - b .L26 - .align 3 - -.L20: // INCX!=1 - move TEMP, X - addi.d i0, i0, 1 - ld.d t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - xvinsgr2vr.d VM0, t1, 0 - srai.d I, N, 3 - bge $r0, I, .L21 - ld.d t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.d t3, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.d t4, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - xvinsgr2vr.d VM0, t2, 1 - xvinsgr2vr.d VM0, t3, 2 - xvinsgr2vr.d VM0, t4, 3 - slli.d i0, i0, 2 //4 - xvreplgr2vr.d VINC4, i0 - slli.d i0, i0, 1 //8 - xvreplgr2vr.d VINC8, i0 - addi.d i0, i0, -15 - xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 3 - addi.d i0, i0, 5 - xvinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 1 //2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 2 //3 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 3 //4 - .align 3 - -.L24: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 - xvinsgr2vr.d VX0, t3, 2 - xvinsgr2vr.d VX0, t4, 3 - xvadd.d VI1, VI1, VINC8 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX1, t1, 0 - xvinsgr2vr.d VX1, t2, 1 - xvinsgr2vr.d VX1, t3, 2 - xvinsgr2vr.d VX1, t4, 3 - xvadd.d VI2, VI1, VINC4 - xvfmina.d VM1, VX0, VX1 - xvfcmp.ceq.d 
VT0, VX0, VM1 - xvbitsel.v VI2, VI2, VI1, VT0 - xvfmina.d VM1, VM0, VM1 - xvfcmp.ceq.d VT0, VM0, VM1 - addi.d I, I, -1 - xvbitsel.v VM0, VM1, VM0, VT0 - xvbitsel.v VI0, VI2, VI0, VT0 - blt $r0, I, .L24 - .align 3 - -.L25: - xvpickve.d VI1, VI0, 0 - xvpickve.d VI2, VI0, 1 - xvpickve.d VI3, VI0, 2 - xvpickve.d VI4, VI0, 3 - xvpickve.d x1, VM0, 0 - xvpickve.d x2, VM0, 1 - xvpickve.d x3, VM0, 2 - xvpickve.d x4, VM0, 3 - xvfmina.d VM1, x1, x2 - xvfcmp.ceq.d VT0, x1, VM1 - xvbitsel.v VINC4, VI2, VI1, VT0 - xvfmina.d VM0, x4, x3 - xvfcmp.ceq.d VT0, x3, VM0 - xvbitsel.v VINC8, VI4, VI3, VT0 - xvfmina.d VM0, VM0, VM1 - xvfcmp.ceq.d VT0, VM0, VM1 - xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - xvfcmp.ceq.d VT0, VM0, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - xvfcmp.clt.d VT0, VI1, VI0 - xvbitsel.v VI0, VI0, VI1, VT0 - .align 3 - -.L26: - xvfcmp.ceq.d VT0, VM0, x2 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L27 - xvfcmp.clt.d VT0, VI2, VI0 - xvbitsel.v VI0, VI0, VI2, VT0 - .align 3 - -.L27: - xvfcmp.ceq.d VT0, VM0, x3 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L28 - xvfcmp.clt.d VT0, VI3, VI0 - xvbitsel.v VI0, VI0, VI3, VT0 - .align 3 - -.L28: - xvfcmp.ceq.d VT0, VM0, x4 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L29 - xvfcmp.clt.d VT0, VI4, VI0 - xvbitsel.v VI0, VI0, VI4, VT0 - .align 3 - -.L29: - movfr2gr.d i0, $f20 - .align 3 - -.L21: // N<8 - andi I, N, 7 - bge $r0, I, .L999 - srai.d i1, N, 3 - slli.d i1, i1, 3 - addi.d i1, i1, 1 //current index - movgr2fr.d $f21, i1 - movgr2fr.d $f20, i0 - .align 3 - -.L22: - fld.d $f9, X, 0 - addi.d I, I, -1 - xvfmina.d VM1, x1, VM0 - xvfcmp.ceq.d VT0, VM0, VM1 - add.d X, X, INCX - xvbitsel.v VM0, VM1, VM0, VT0 - xvbitsel.v VI0, VI1, VI0, VT0 - addi.d i1, i1, 1 - movgr2fr.d $f21, i1 - blt $r0, I, .L22 - movfr2gr.d i0, $f20 - .align 3 - -.L999: - move $r4, $r17 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/idamin_lsx.S b/kernel/loongarch64/idamin_lsx.S deleted file mode 100644 index 9eb9d883f..000000000 --- a/kernel/loongarch64/idamin_lsx.S +++ /dev/null @@ -1,228 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define t1 $r13 -#define t2 $r15 -#define t3 $r18 -#define t4 $r16 -#define i0 $r17 -#define i1 $r14 -#define TEMP $r19 -#define x1 $vr9 -#define x2 $vr10 -#define x3 $vr11 -#define x4 $vr12 -#define VX0 $vr13 -#define VX1 $vr14 -#define VM0 $vr15 -#define VM1 $vr16 -#define VINC2 $vr17 -#define VINC4 $vr18 -#define VI0 $vr20 -#define VI1 $vr21 -#define VI2 $vr22 -#define VI3 $vr8 -#define VI4 $vr19 -#define VT0 $vr23 - - PROLOGUE - li.d i0, 0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - vld VM0, X, 0 - addi.d i0, i0, 1 - srai.d I, N, 3 - bge $r0, I, .L21 - slli.d i0, i0, 1 //2 - vreplgr2vr.d VINC2, i0 - slli.d i0, i0, 1 //4 - vreplgr2vr.d VINC4, i0 - addi.d i0, i0, -7 - vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - vinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 3 - vinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - vinsgr2vr.d VI0, i0, 1 //2 - .align 3 - -.L10: - vld VX0, X, 0 * SIZE - vadd.d VI1, VI1, VINC4 - vld VX1, X, 2 * SIZE - vadd.d VI2, VI1, VINC2 - vfmina.d x1, VX0, VX1 - vfcmp.ceq.d VT0, VX0, x1 - vbitsel.v x2, VI2, VI1, VT0 - vld VX0, X, 4 * SIZE - vadd.d VI1, VI2, VINC2 - vld VX1, X, 6 * SIZE - vadd.d VI2, VI1, VINC2 - 
vfmina.d x3, VX0, VX1 - vfcmp.ceq.d VT0, VX0, x3 - vbitsel.v x4, VI2, VI1, VT0 - vfmina.d x3, x1, x3 - vfcmp.ceq.d VT0, x1, x3 - addi.d I, I, -1 - vbitsel.v x2, x4, x2, VT0 - vfmina.d VM1, VM0, x3 - vfcmp.ceq.d VT0, VM0, VM1 - addi.d X, X, 8 * SIZE - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, x2, VI0, VT0 - blt $r0, I, .L10 - .align 3 - -.L15: - vreplvei.d VI1, VI0, 0 - vreplvei.d VI2, VI0, 1 - vreplvei.d x1, VM0, 0 - vreplvei.d x2, VM0, 1 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - vfcmp.ceq.d VT0, x2, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.d VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - b .L27 - .align 3 - -.L20: // INCX!=1 - move TEMP, X - addi.d i0, i0, 1 - ld.d t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.d VM0, t1, 0 - srai.d I, N, 3 - bge $r0, I, .L21 - ld.d t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.d VM0, t2, 1 - slli.d i0, i0, 1 //2 - vreplgr2vr.d VINC2, i0 - slli.d i0, i0, 1 //4 - vreplgr2vr.d VINC4, i0 - addi.d i0, i0, -7 - vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - vinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 3 - vinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - vinsgr2vr.d VI0, i0, 1 //2 - .align 3 - -.L24: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - vadd.d VI1, VI1, VINC4 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - vadd.d VI2, VI1, VINC2 - vfmina.d x1, VX0, VX1 - vfcmp.ceq.d VT0, VX0, x1 - vbitsel.v x2, VI2, VI1, VT0 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - vadd.d VI1, VI2, VINC2 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - vadd.d VI2, VI1, VINC2 - vfmina.d x3, VX0, VX1 - vfcmp.ceq.d VT0, VX0, x3 - vbitsel.v x4, VI2, VI1, VT0 - vfmina.d x3, x1, x3 - vfcmp.ceq.d VT0, x1, x3 - addi.d I, I, -1 - vbitsel.v x2, x4, x2, VT0 - vfmina.d VM1, VM0, x3 - vbitsel.v VM0, VM1, VM0, VT0 - vfcmp.ceq.d VT0, VM0, VM1 - vbitsel.v VI0, x2, VI0, VT0 - blt $r0, I, .L24 - .align 3 - -.L25: - vreplvei.d VI1, VI0, 0 - vreplvei.d VI2, VI0, 1 - vreplvei.d x1, VM0, 0 - vreplvei.d x2, VM0, 1 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - vfcmp.ceq.d VT0, x2, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.d VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - b .L27 - .align 3 - -.L26: - vfmina.d VM0, x1, x2 - vfcmp.ceq.d VT0, x1, VM0 - vbitsel.v VI0, VI2, VI1, VT0 - .align 3 - -.L27: - movfr2gr.d i0, $f20 - .align 3 - -.L21: //N<8 - andi I, N, 7 - bge $r0, I, .L999 - srai.d i1, N, 3 - slli.d i1, i1, 3 - addi.d i1, i1, 1 //current index - movgr2fr.d $f21, i1 - movgr2fr.d $f20, i0 - .align 3 - -.L22: - fld.d $f9, X, 0 - addi.d I, I, -1 - vfmina.d VM1, x1, VM0 - vfcmp.ceq.d VT0, VM0, VM1 - add.d X, X, INCX - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, VI1, VI0, VT0 - addi.d i1, i1, 1 - movgr2fr.d $f21, i1 - blt $r0, I, .L22 - movfr2gr.d i0, $f20 - .align 3 - -.L999: - move $r4, $r17 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/isamin_lsx.S b/kernel/loongarch64/isamin_lsx.S deleted file mode 100644 index 598888660..000000000 --- a/kernel/loongarch64/isamin_lsx.S +++ /dev/null @@ -1,275 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - 
-#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define t1 $r13 -#define t2 $r15 -#define t3 $r18 -#define t4 $r16 -#define i0 $r17 -#define i1 $r14 -#define TEMP $r19 -#define x1 $vr9 -#define x2 $vr10 -#define x3 $vr11 -#define x4 $vr12 -#define VX0 $vr13 -#define VX1 $vr14 -#define VM0 $vr15 -#define VM1 $vr16 -#define VINC4 $vr17 -#define VINC8 $vr18 -#define VI0 $vr20 -#define VI1 $vr21 -#define VI2 $vr22 -#define VI3 $vr8 -#define VI4 $vr19 -#define VT0 $vr23 - - PROLOGUE - li.d i0, 0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - vld VM0, X, 0 - addi.w i0, i0, 1 - srai.d I, N, 3 - bge $r0, I, .L21 - slli.w i0, i0, 2 //4 - vreplgr2vr.w VINC4, i0 - slli.w i0, i0, 1 //8 - vreplgr2vr.w VINC8, i0 - addi.w i0, i0, -15 - vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 1 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 2 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 3 - addi.w i0, i0, 5 - vinsgr2vr.w VI0, i0, 0 //1 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 1 //2 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 2 //3 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 3 //4 - .align 3 - -.L10: - vld VX0, X, 0 * SIZE - vadd.w VI1, VI1, VINC8 - vld VX1, X, 4 * SIZE - vadd.w VI2, VI1, VINC4 - vfmina.s VM1, VX0, VX1 - vfcmp.ceq.s VT0, VX0, VM1 - addi.d I, I, -1 - vbitsel.v VI2, VI2, VI1, VT0 - vfmina.s VM1, VM0, VM1 - vfcmp.ceq.s VT0, VM0, VM1 - addi.d X, X, 8 * SIZE - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, VI2, VI0, VT0 - blt $r0, I, .L10 - .align 3 - -.L15: - vreplvei.w VI1, VI0, 0 - vreplvei.w VI2, VI0, 1 - vreplvei.w VI3, VI0, 2 - vreplvei.w VI4, VI0, 3 - vreplvei.w x1, VM0, 0 - vreplvei.w x2, VM0, 1 - vreplvei.w x3, VM0, 2 - vreplvei.w x4, VM0, 3 - vfmina.s VM1, x1, x2 - vfcmp.ceq.s VT0, VM1, x1 - vbitsel.v VINC4, VI2, VI1, VT0 - vfmina.s VM0, x3, x4 - vfcmp.ceq.s VT0, x3, VM0 - vbitsel.v VINC8, VI4, VI3, VT0 - vfmina.s VM0, VM0, VM1 - vfcmp.ceq.s VT0, VM0, VM1 - vbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - vfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.s VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - b .L26 - .align 3 - -.L20: // INCX!=1 - move TEMP, X - addi.w i0, i0, 1 - ld.w t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.w VM0, t1, 0 - srai.d I, N, 3 - bge $r0, I, .L21 - ld.w t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.w t3, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.w t4, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.w VM0, t2, 1 - vinsgr2vr.w VM0, t3, 2 - vinsgr2vr.w VM0, t4, 3 - slli.w i0, i0, 2 //4 - vreplgr2vr.w VINC4, i0 - slli.w i0, i0, 1 //8 - vreplgr2vr.w VINC8, i0 - addi.w i0, i0, -15 - vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 1 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 2 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 3 - addi.w i0, i0, 5 - vinsgr2vr.w VI0, i0, 0 //1 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 1 //2 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 2 //3 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 3 //4 - .align 3 - -.L24: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - vadd.w VI1, VI1, VINC8 - ld.w t1, 
X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vadd.w VI2, VI1, VINC4 - vfmina.s VM1, VX0, VX1 - vfcmp.ceq.s VT0, VX0, VM1 - vbitsel.v VI2, VI2, VI1, VT0 - vfmina.s VM1, VM0, VM1 - vfcmp.ceq.s VT0, VM0, VM1 - addi.d I, I, -1 - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, VI2, VI0, VT0 - blt $r0, I, .L24 - .align 3 - -.L25: - vreplvei.w VI1, VI0, 0 - vreplvei.w VI2, VI0, 1 - vreplvei.w VI3, VI0, 2 - vreplvei.w VI4, VI0, 3 - vreplvei.w x1, VM0, 0 - vreplvei.w x2, VM0, 1 - vreplvei.w x3, VM0, 2 - vreplvei.w x4, VM0, 3 - vfmina.s VM1, x1, x2 - vfcmp.ceq.s VT0, VM1, x1 - vbitsel.v VINC4, VI2, VI1, VT0 - vfmina.s VM0, x3, x4 - vfcmp.ceq.s VT0, x3, VM0 - vbitsel.v VINC8, VI4, VI3, VT0 - vfmina.s VM0, VM0, VM1 - vfcmp.ceq.s VT0, VM0, VM1 - vbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - vfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.s VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - .align 3 - -.L26: - vfcmp.ceq.s VT0, VM0, x2 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L27 - vfcmp.clt.s VT0, VI2, VI0 - vbitsel.v VI0, VI0, VI2, VT0 - .align 3 - -.L27: - vfcmp.ceq.s VT0, VM0, x3 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L28 - vfcmp.clt.s VT0, VI3, VI0 - vbitsel.v VI0, VI0, VI3, VT0 - .align 3 - -.L28: - vfcmp.ceq.s VT0, VM0, x4 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L29 - vfcmp.clt.s VT0, VI4, VI0 - vbitsel.v VI0, VI0, VI4, VT0 - .align 3 - -.L29: - movfr2gr.s i0, $f20 - .align 3 - -.L21: //N<8 - andi I, N, 7 - bge $r0, I, .L999 - srai.d i1, N, 3 - slli.d i1, i1, 3 - addi.d i1, i1, 1 //current index - movgr2fr.d $f21, i1 - movgr2fr.d $f20, i0 - .align 3 - -.L22: - fld.s $f9, X, 0 - addi.d I, I, -1 - vfmina.s VM1, x1, VM0 - vfcmp.ceq.s VT0, VM0, VM1 - add.d X, X, INCX - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, VI1, VI0, VT0 - addi.d i1, i1, 1 - movgr2fr.d $f21, i1 - blt $r0, I, .L22 - movfr2gr.s i0, $f20 - .align 3 - -.L999: - move $r4, $r17 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file From 8be26541933b36c6e3e8002c44002efb02033bdd Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Thu, 28 Dec 2023 10:24:24 +0800 Subject: [PATCH 04/21] loongarch64: Refine imax optimization. 
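
Merge the single- and double-precision index-of-max kernels into one type-generic source per vector extension (imax_lsx.S for LSX, imax_lasx.S for LASX). The element type is selected with #ifdef DOUBLE together with the CMPLT/VCMPLT/XVCMPLT and MTG macros, so ISMAXKERNEL and IDMAXKERNEL can point at the same file and the duplicated ismax_*/idmax_* sources can be dropped.

For reference, a minimal scalar sketch in C of the semantics the vector code has to preserve (illustrative only, not the OpenBLAS C kernel, and the function name is made up): return the 1-based index of the first maximum, keeping the smaller index when values tie.

    /* Illustrative scalar reference (not OpenBLAS source): 1-based index of
     * the first largest element of x with positive stride incx.  Ties keep
     * the smallest index because only a strictly greater value replaces m. */
    static long imax_ref(long n, const double *x, long incx)
    {
        if (n <= 0 || incx <= 0) return 0;
        long best = 1;
        double m = x[0];
        for (long i = 1; i < n; i++) {
            if (x[i * incx] > m) {
                m = x[i * incx];
                best = i + 1;
            }
        }
        return best;
    }

The vector paths reduce the lanes at the end (.L15/.L25) and, when two lane maxima compare equal, keep the lane with the smaller stored index, which reproduces the same tie-breaking as the scalar tail loop.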
--- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 4 +- kernel/loongarch64/KERNEL.LOONGSON3R5 | 4 +- kernel/loongarch64/idmax_lasx.S | 273 ----------- kernel/loongarch64/idmax_lsx.S | 225 --------- .../loongarch64/{ismax_lasx.S => imax_lasx.S} | 230 ++++++++-- kernel/loongarch64/imax_lsx.S | 428 ++++++++++++++++++ kernel/loongarch64/ismax_lsx.S | 272 ----------- 7 files changed, 626 insertions(+), 810 deletions(-) delete mode 100644 kernel/loongarch64/idmax_lasx.S delete mode 100644 kernel/loongarch64/idmax_lsx.S rename kernel/loongarch64/{ismax_lasx.S => imax_lasx.S} (57%) create mode 100644 kernel/loongarch64/imax_lsx.S delete mode 100644 kernel/loongarch64/ismax_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 4eae2e4f9..346f1fb45 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -19,8 +19,8 @@ DMAXKERNEL = max_lsx.S SMINKERNEL = min_lsx.S DMINKERNEL = min_lsx.S -ISMAXKERNEL = ismax_lsx.S -IDMAXKERNEL = idmax_lsx.S +ISMAXKERNEL = imax_lsx.S +IDMAXKERNEL = imax_lsx.S ISMINKERNEL = ismin_lsx.S IDMINKERNEL = idmin_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index e7e1b5d5a..6b4df2d61 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -19,8 +19,8 @@ DMAXKERNEL = max_lsx.S SMINKERNEL = min_lsx.S DMINKERNEL = min_lsx.S -ISMAXKERNEL = ismax_lasx.S -IDMAXKERNEL = idmax_lasx.S +ISMAXKERNEL = imax_lasx.S +IDMAXKERNEL = imax_lasx.S ISMINKERNEL = ismin_lasx.S IDMINKERNEL = idmin_lasx.S diff --git a/kernel/loongarch64/idmax_lasx.S b/kernel/loongarch64/idmax_lasx.S deleted file mode 100644 index bbfe0941a..000000000 --- a/kernel/loongarch64/idmax_lasx.S +++ /dev/null @@ -1,273 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define t1 $r13 -#define t2 $r15 -#define t3 $r18 -#define t4 $r16 -#define i0 $r17 -#define i1 $r14 -#define TEMP $r19 -#define x1 $xr9 -#define x2 $xr10 -#define x3 $xr11 -#define x4 $xr12 -#define VX0 $xr13 -#define VX1 $xr14 -#define VM0 $xr15 -#define VM1 $xr16 -#define VINC4 $xr17 -#define VINC8 $xr18 -#define VI0 $xr20 -#define VI1 $xr21 -#define VI2 $xr22 -#define VI3 $xr8 -#define VI4 $xr19 -#define VT0 $xr23 - - PROLOGUE - li.d i0, 0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - xvld VM0, X, 0 - addi.d i0, i0, 1 - srai.d I, N, 3 - bge $r0, I, .L21 - slli.d i0, i0, 2 //4 - xvreplgr2vr.d VINC4, i0 - slli.d i0, i0, 1 //8 - xvreplgr2vr.d VINC8, i0 - addi.d i0, i0, -15 - xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 3 - addi.d i0, i0, 5 - xvinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 1 //2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 2 //3 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 3 //4 - .align 3 - -.L10: - xvld VX0, X, 0 * SIZE - xvadd.d VI1, VI1, VINC8 - xvld VX1, X, 4 * SIZE - xvadd.d VI2, VI1, VINC4 - xvfcmp.clt.d VT0, VX0, VX1 - addi.d I, I, -1 - xvbitsel.v VM1, VX0, VX1, VT0 - xvbitsel.v VI2, VI1, VI2, VT0 - xvfcmp.clt.d VT0, VM0, VM1 - addi.d X, X, 8 * SIZE - xvbitsel.v VM0, VM0, VM1, VT0 - xvbitsel.v VI0, VI0, VI2, VT0 - blt $r0, I, .L10 - .align 3 - -.L15: - xvpickve.d VI1, VI0, 0 - xvpickve.d VI2, VI0, 1 - 
xvpickve.d VI3, VI0, 2 - xvpickve.d VI4, VI0, 3 - xvpickve.d x1, VM0, 0 - xvpickve.d x2, VM0, 1 - xvpickve.d x3, VM0, 2 - xvpickve.d x4, VM0, 3 - xvfcmp.clt.d VT0, x1, x2 - xvbitsel.v VM1, x1, x2, VT0 - xvbitsel.v VINC4, VI1, VI2, VT0 - xvfcmp.clt.d VT0, x3, x4 - xvbitsel.v VM0, x3, x4, VT0 - xvbitsel.v VINC8, VI3, VI4, VT0 - xvfcmp.clt.d VT0, VM0, VM1 - xvbitsel.v VM0, VM0, VM1, VT0 - xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - xvfcmp.ceq.d VT0, VM0, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - xvfcmp.clt.d VT0, VI1, VI0 - xvbitsel.v VI0, VI0, VI1, VT0 - b .L26 - .align 3 - - -.L20: // INCX!=1 - move TEMP, X - addi.d i0, i0, 1 - ld.d t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - xvinsgr2vr.d VM0, t1, 0 - srai.d I, N, 3 - bge $r0, I, .L21 - ld.d t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.d t3, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.d t4, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - xvinsgr2vr.d VM0, t2, 1 - xvinsgr2vr.d VM0, t3, 2 - xvinsgr2vr.d VM0, t4, 3 - slli.d i0, i0, 2 //4 - xvreplgr2vr.d VINC4, i0 - slli.d i0, i0, 1 //8 - xvreplgr2vr.d VINC8, i0 - addi.d i0, i0, -15 - xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 3 - addi.d i0, i0, 5 - xvinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 1 //2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 2 //3 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 3 //4 - .align 3 - -.L24: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 - xvinsgr2vr.d VX0, t3, 2 - xvinsgr2vr.d VX0, t4, 3 - xvadd.d VI1, VI1, VINC8 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX1, t1, 0 - xvinsgr2vr.d VX1, t2, 1 - xvinsgr2vr.d VX1, t3, 2 - xvinsgr2vr.d VX1, t4, 3 - xvadd.d VI2, VI1, VINC4 - xvfcmp.clt.d VT0, VX0, VX1 - addi.d I, I, -1 - xvbitsel.v VM1, VX0, VX1, VT0 - xvbitsel.v VI2, VI1, VI2, VT0 - xvfcmp.clt.d VT0, VM0, VM1 - xvbitsel.v VM0, VM0, VM1, VT0 - xvbitsel.v VI0, VI0, VI2, VT0 - blt $r0, I, .L24 - .align 3 - -.L25: - xvpickve.d VI1, VI0, 0 - xvpickve.d VI2, VI0, 1 - xvpickve.d VI3, VI0, 2 - xvpickve.d VI4, VI0, 3 - xvpickve.d x1, VM0, 0 - xvpickve.d x2, VM0, 1 - xvpickve.d x3, VM0, 2 - xvpickve.d x4, VM0, 3 - xvfcmp.clt.d VT0, x1, x2 - xvbitsel.v VM1, x1, x2, VT0 - xvbitsel.v VINC4, VI1, VI2, VT0 - xvfcmp.clt.d VT0, x3, x4 - xvbitsel.v VM0, x3, x4, VT0 - xvbitsel.v VINC8, VI3, VI4, VT0 - xvfcmp.clt.d VT0, VM0, VM1 - xvbitsel.v VM0, VM0, VM1, VT0 - xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - xvfcmp.ceq.d VT0, VM0, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - xvfcmp.clt.d VT0, VI1, VI0 - xvbitsel.v VI0, VI0, VI1, VT0 - .align 3 - -.L26: - xvfcmp.ceq.d VT0, VM0, x2 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L27 - xvfcmp.clt.d VT0, VI2, VI0 - xvbitsel.v VI0, VI0, VI2, VT0 - .align 3 - -.L27: - xvfcmp.ceq.d VT0, VM0, x3 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L28 - xvfcmp.clt.d VT0, VI3, VI0 - xvbitsel.v VI0, VI0, VI3, VT0 - .align 3 - -.L28: - xvfcmp.ceq.d VT0, VM0, x4 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L29 - 
xvfcmp.clt.d VT0, VI4, VI0 - xvbitsel.v VI0, VI0, VI4, VT0 - .align 3 - -.L29: - movfr2gr.d i0, $f20 - .align 3 - -.L21: //N<8 - andi I, N, 7 - bge $r0, I, .L999 - srai.d i1, N, 3 - slli.d i1, i1, 3 - addi.d i1, i1, 1 //current index - movgr2fr.d $f21, i1 - movgr2fr.d $f20, i0 - .align 3 - -.L22: - fld.d $f9, X, 0 - addi.d I, I, -1 - fcmp.clt.d $fcc0, $f15, $f9 - add.d X, X, INCX - fsel $f15, $f15, $f9, $fcc0 - fsel $f20, $f20, $f21, $fcc0 - addi.d i1, i1, 1 - movgr2fr.d $f21, i1 - blt $r0, I, .L22 - movfr2gr.d i0, $f20 - .align 3 - -.L999: - move $r4, $r17 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/idmax_lsx.S b/kernel/loongarch64/idmax_lsx.S deleted file mode 100644 index 1b4734bab..000000000 --- a/kernel/loongarch64/idmax_lsx.S +++ /dev/null @@ -1,225 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define t1 $r13 -#define t2 $r15 -#define t3 $r18 -#define t4 $r16 -#define i0 $r17 -#define i1 $r14 -#define TEMP $r19 -#define x1 $vr9 -#define x2 $vr10 -#define x3 $vr11 -#define x4 $vr12 -#define VX0 $vr13 -#define VX1 $vr14 -#define VM0 $vr15 -#define VM1 $vr16 -#define VINC2 $vr17 -#define VINC4 $vr18 -#define VI0 $vr20 -#define VI1 $vr21 -#define VI2 $vr22 -#define VI3 $vr8 -#define VI4 $vr19 -#define VT0 $vr23 - - PROLOGUE - li.d i0, 0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - vld VM0, X, 0 - addi.d i0, i0, 1 - srai.d I, N, 3 - bge $r0, I, .L21 - slli.d i0, i0, 1 //2 - vreplgr2vr.d VINC2, i0 - slli.d i0, i0, 1 //4 - vreplgr2vr.d VINC4, i0 - addi.d i0, i0, -7 - vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - vinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 3 - vinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - vinsgr2vr.d VI0, i0, 1 //2 - .align 3 - -.L10: - vld VX0, X, 0 * SIZE - vadd.d VI1, VI1, VINC4 - vld VX1, X, 2 * SIZE - vadd.d VI2, VI1, VINC2 - vfcmp.clt.d VT0, VX0, VX1 - vbitsel.v x1, VX0, VX1, VT0 - vbitsel.v x2, VI1, VI2, VT0 - vld VX0, X, 4 * SIZE - vadd.d VI1, VI2, VINC2 - vld VX1, X, 6 * SIZE - vadd.d VI2, VI1, VINC2 - vfcmp.clt.d VT0, VX0, VX1 - addi.d I, I, -1 - vbitsel.v x3, VX0, VX1, VT0 - vbitsel.v x4, VI1, VI2, VT0 - vfcmp.clt.d VT0, x1, x3 - vbitsel.v x1, x1, x3, VT0 - vbitsel.v x2, x2, x4, VT0 - vfcmp.clt.d VT0, VM0, x1 - addi.d X, X, 8 * SIZE - vbitsel.v VM0, VM0, x1, VT0 - vbitsel.v VI0, VI0, x2, VT0 - blt $r0, I, .L10 - .align 3 - -.L15: - vreplvei.d VI1, VI0, 0 - vreplvei.d VI2, VI0, 1 - vreplvei.d x1, VM0, 0 - vreplvei.d x2, VM0, 1 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - vfcmp.ceq.d VT0, x2, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.d VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - b .L27 - .align 3 - -.L20: // INCX!=1 - move TEMP, X - addi.d i0, i0, 1 - ld.d t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.d VM0, t1, 0 - srai.d I, N, 3 - bge $r0, I, .L21 - ld.d t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.d VM0, t2, 1 - slli.d i0, i0, 1 //2 - vreplgr2vr.d VINC2, i0 - slli.d i0, i0, 1 //4 - vreplgr2vr.d VINC4, i0 - addi.d i0, i0, -7 - vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - vinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 3 - vinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - vinsgr2vr.d VI0, i0, 1 //2 - .align 3 - -.L24: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * 
SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - vadd.d VI1, VI1, VINC4 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - vadd.d VI2, VI1, VINC2 - vfcmp.clt.d VT0, VX0, VX1 - vbitsel.v x1, VX0, VX1, VT0 - vbitsel.v x2, VI1, VI2, VT0 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - vadd.d VI1, VI2, VINC2 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - vadd.d VI2, VI1, VINC2 - vfcmp.clt.d VT0, VX0, VX1 - vbitsel.v x3, VX0, VX1, VT0 - vbitsel.v x4, VI1, VI2, VT0 - vfcmp.clt.d VT0, x1, x3 - vbitsel.v x1, x1, x3, VT0 - vbitsel.v x2, x2, x4, VT0 - vfcmp.clt.d VT0, VM0, x1 - addi.d I, I, -1 - vbitsel.v VM0, VM0, x1, VT0 - vbitsel.v VI0, VI0, x2, VT0 - blt $r0, I, .L24 - .align 3 - -.L25: - vreplvei.d VI1, VI0, 0 - vreplvei.d VI2, VI0, 1 - vreplvei.d x1, VM0, 0 - vreplvei.d x2, VM0, 1 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - vfcmp.ceq.d VT0, x2, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.d VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - b .L27 - .align 3 - -.L26: - vfcmp.clt.d VT0, x1, x2 - vbitsel.v VM0, x1, x2, VT0 - vbitsel.v VI0, VI1, VI2, VT0 - .align 3 - -.L27: - movfr2gr.d i0, $f20 - .align 3 - -.L21: //N<8 - andi I, N, 7 - bge $r0, I, .L999 - srai.d i1, N, 3 - slli.d i1, i1, 3 - addi.d i1, i1, 1 //current index - movgr2fr.d $f21, i1 - movgr2fr.d $f20, i0 - .align 3 - -.L22: - fld.d $f9, X, 0 - addi.d I, I, -1 - fcmp.clt.d $fcc0, $f15, $f9 - add.d X, X, INCX - fsel $f15, $f15, $f9, $fcc0 - fsel $f20, $f20, $f21, $fcc0 - addi.d i1, i1, 1 - movgr2fr.d $f21, i1 - blt $r0, I, .L22 - movfr2gr.d i0, $f20 - .align 3 - -.L999: - move $r4, $r17 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/ismax_lasx.S b/kernel/loongarch64/imax_lasx.S similarity index 57% rename from kernel/loongarch64/ismax_lasx.S rename to kernel/loongarch64/imax_lasx.S index 843dd6c6a..2d3d5e9d3 100644 --- a/kernel/loongarch64/ismax_lasx.S +++ b/kernel/loongarch64/imax_lasx.S @@ -1,3 +1,29 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ #define ASSEMBLER #include "common.h" @@ -39,6 +65,31 @@ slli.d INCX, INCX, BASE_SHIFT bne INCX, TEMP, .L20 xvld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else addi.w i0, i0, 1 srai.d I, N, 3 bge $r0, I, .L21 @@ -76,20 +127,47 @@ xvinsgr2vr.w VI0, i0, 6 //7 addi.w i0, i0, 1 xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 .L10: xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvadd.d VI1, VI1, VINC8 + xvld VX1, X, 4 * SIZE + xvadd.d VI2, VI1, VINC4 + xvfcmp.clt.d VT0, VX0, VX1 + addi.d I, I, -1 + xvbitsel.v VM1, VX0, VX1, VT0 + xvbitsel.v VI2, VI1, VI2, VT0 + xvfcmp.clt.d VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VI0, VI2, VT0 +#else xvadd.w VI1, VI1, VINC8 xvfcmp.clt.s VT0, VM0, VX0 addi.d I, I, -1 xvbitsel.v VM0, VM0, VX0, VT0 xvbitsel.v VI0, VI0, VI1, VT0 addi.d X, X, 8 * SIZE +#endif blt $r0, I, .L10 .align 3 .L15: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfcmp.clt.d VT0, x1, x2 +#else xvxor.v VX0, VX0, VX0 xvor.v VX0, VI0, VX0 xvxor.v VX1, VX1, VX1 @@ -103,28 +181,33 @@ xvpickve.w x3, VM0, 2 xvpickve.w x4, VM0, 3 xvfcmp.clt.s VT0, x1, x2 +#endif xvbitsel.v VM1, x1, x2, VT0 xvbitsel.v VINC4, VI1, VI2, VT0 - xvfcmp.clt.s VT0, x3, x4 + XVCMPLT VT0, x3, x4 xvbitsel.v VM0, x3, x4, VT0 xvbitsel.v VINC8, VI3, VI4, VT0 - xvfcmp.clt.s VT0, VM0, VM1 + XVCMPLT VT0, VM0, VM1 xvbitsel.v VM0, VM0, VM1, VT0 xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - xvfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f9 bceqz $fcc0, .L26 - xvfcmp.clt.s VT0, VI1, VI0 + XVCMPLT VT0, VI1, VI0 xvbitsel.v VI0, VI0, VI1, VT0 b .L26 .align 3 - .L20: // INCX!=1 move TEMP, X +#ifdef DOUBLE + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE +#else addi.w i0, i0, 1 ld.w t1, TEMP, 0 * SIZE add.d TEMP, TEMP, INCX @@ -143,11 +226,38 @@ ld.w t1, TEMP, 0 * SIZE add.d TEMP, TEMP, INCX ld.w t2, TEMP, 0 * SIZE +#endif add.d TEMP, TEMP, INCX - ld.w t3, TEMP, 0 * SIZE + ld.d t3, TEMP, 0 * SIZE add.d TEMP, TEMP, INCX - ld.w t4, TEMP, 0 * SIZE + ld.d t4, TEMP, 0 * SIZE add.d TEMP, TEMP, 
INCX +#ifdef DOUBLE + xvinsgr2vr.d VM0, t1, 0 + xvinsgr2vr.d VM0, t2, 1 + xvinsgr2vr.d VM0, t3, 2 + xvinsgr2vr.d VM0, t4, 3 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else xvinsgr2vr.w VM0, t1, 4 xvinsgr2vr.w VM0, t2, 5 xvinsgr2vr.w VM0, t3, 6 @@ -186,9 +296,46 @@ xvinsgr2vr.w VI0, i0, 6 //7 addi.w i0, i0, 1 xvinsgr2vr.w VI0, i0, 7 //8 +#endif .align 3 .L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + xvadd.d VI1, VI1, VINC8 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvadd.d VI1, VI1, VINC8 + xvadd.d VI2, VI1, VINC4 + xvfcmp.clt.d VT0, VX0, VX1 + addi.d I, I, -1 + xvbitsel.v VM1, VX0, VX1, VT0 + xvbitsel.v VI2, VI1, VI2, VT0 + xvfcmp.clt.d VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VI0, VI2, VT0 +#else ld.w t1, X, 0 * SIZE add.d X, X, INCX ld.w t2, X, 0 * SIZE @@ -218,10 +365,21 @@ addi.d I, I, -1 xvbitsel.v VM0, VM0, VX0, VT0 xvbitsel.v VI0, VI0, VI1, VT0 +#endif blt $r0, I, .L24 .align 3 .L25: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 +#else xvxor.v VX0, VX0, VX0 xvor.v VX0, VI0, VX0 xvxor.v VX1, VX1, VX1 @@ -230,57 +388,56 @@ xvpickve.w VI2, VI0, 1 xvpickve.w VI3, VI0, 2 xvpickve.w VI4, VI0, 3 - xvpickve.w x1, VM0, 0 - xvpickve.w x2, VM0, 1 - xvpickve.w x3, VM0, 2 - xvpickve.w x4, VM0, 3 - xvfcmp.clt.s VT0, x1, x2 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 +#endif + XVCMPLT VT0, x1, x2 xvbitsel.v VM1, x1, x2, VT0 xvbitsel.v VINC4, VI1, VI2, VT0 - xvfcmp.clt.s VT0, x3, x4 + XVCMPLT VT0, x3, x4 xvbitsel.v VM0, x3, x4, VT0 xvbitsel.v VINC8, VI3, VI4, VT0 - xvfcmp.clt.s VT0, VM0, VM1 + XVCMPLT VT0, VM0, VM1 xvbitsel.v VM0, VM0, VM1, VT0 xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - xvfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f9 bceqz $fcc0, .L26 - xvfcmp.clt.s VT0, VI1, VI0 + XVCMPLT VT0, VI1, VI0 xvbitsel.v VI0, VI0, VI1, VT0 .align 3 .L26: - xvfcmp.ceq.s VT0, VM0, x2 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f10 bceqz $fcc0, .L27 - xvfcmp.clt.s VT0, VI2, VI0 + XVCMPLT VT0, VI2, VI0 xvbitsel.v VI0, VI0, VI2, VT0 .align 3 .L27: - xvfcmp.ceq.s VT0, VM0, x3 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f11 bceqz $fcc0, .L28 - xvfcmp.clt.s VT0, VI3, VI0 + XVCMPLT VT0, VI3, VI0 xvbitsel.v VI0, VI0, VI3, VT0 .align 3 .L28: - xvfcmp.ceq.s VT0, VM0, x4 - fcmp.ceq.s $fcc0, $f23, $f17 + 
fcmp.ceq.d $fcc0, $f15, $f12 bceqz $fcc0, .L29 - xvfcmp.clt.s VT0, VI4, VI0 + XVCMPLT VT0, VI4, VI0 xvbitsel.v VI0, VI0, VI4, VT0 .align 3 .L29: +#ifdef DOUBLE + movfr2gr.d i0, $f20 +#else fmov.s $f16, $f20 +#endif .align 3 +#ifndef DOUBLE .L252: xvxor.v VI0, VI0, VI0 xvor.v VI0, VI0, VX0 @@ -343,6 +500,7 @@ fsel $f15, $f15, $f13, $fcc0 fsel $f20, $f20, $f16, $fcc0 movfr2gr.s i0, $f20 +#endif .L21: //N<8 andi I, N, 7 @@ -357,14 +515,14 @@ .L22: fld.d $f9, X, 0 addi.d I, I, -1 - fcmp.clt.s $fcc0, $f15, $f9 + CMPLT $fcc0, $f15, $f9 add.d X, X, INCX fsel $f15, $f15, $f9, $fcc0 fsel $f20, $f20, $f21, $fcc0 addi.d i1, i1, 1 movgr2fr.d $f21, i1 blt $r0, I, .L22 - movfr2gr.s i0, $f20 + MTG i0, $f20 .align 3 .L999: @@ -372,4 +530,4 @@ jirl $r0, $r1, 0x0 .align 3 - EPILOGUE \ No newline at end of file + EPILOGUE diff --git a/kernel/loongarch64/imax_lsx.S b/kernel/loongarch64/imax_lsx.S new file mode 100644 index 000000000..92556d4e6 --- /dev/null +++ b/kernel/loongarch64/imax_lsx.S @@ -0,0 +1,428 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + vld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 1 //2 + vreplgr2vr.d $vr17, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d $vr18, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w $vr17, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w $vr18, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L10: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vadd.d VI1, VI1, $vr18 + vld VX1, X, 2 * SIZE + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX0, VX1 + vbitsel.v x1, VX0, VX1, VT0 + vbitsel.v x2, VI1, VI2, VT0 + vld VX0, X, 4 * SIZE + vadd.d VI1, VI2, $vr17 + vld VX1, X, 6 * SIZE + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX0, VX1 + addi.d I, I, -1 + vbitsel.v x3, VX0, VX1, VT0 + vbitsel.v x4, VI1, VI2, VT0 + VCMPLT VT0, x1, x3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT0 + VCMPLT VT0, VM0, x1 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, VM0, x1, VT0 + vbitsel.v VI0, VI0, x2, VT0 +#else + vadd.w VI1, VI1, $vr18 + vld VX1, X, 4 * SIZE + vadd.w VI2, VI1, $vr17 + VCMPLT VT0, VX0, VX1 + addi.d I, I, -1 + vbitsel.v VM1, VX0, VX1, VT0 + vbitsel.v VI2, VI1, VI2, VT0 + VCMPLT VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, VI0, VI2, VT0 +#endif + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + VCMPLT VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + VCMPLT VT0, x1, x2 + vbitsel.v VM1, x1, x2, VT0 + vbitsel.v $vr17, VI1, VI2, VT0 + VCMPLT VT0, x3, x4 + vbitsel.v VM0, x3, x4, VT0 + vbitsel.v $vr18, VI3, VI4, VT0 + VCMPLT VT0, VM0, VM1 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, $vr18, $vr17, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + VCMPLT VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L26 +#endif + .align 3 + +.L20: // INCX!=1 + move TEMP, X +#ifdef DOUBLE + addi.d i0, i0, 
1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t2, 1 + slli.d i0, i0, 1 //2 + vreplgr2vr.d $vr17, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d $vr18, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t2, 1 + vinsgr2vr.w VM0, t3, 2 + vinsgr2vr.w VM0, t4, 3 + slli.w i0, i0, 2 //4 + vreplgr2vr.w $vr17, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w $vr18, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI1, $vr18 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX0, VX1 + vbitsel.v x1, VX0, VX1, VT0 + vbitsel.v x2, VI1, VI2, VT0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI2, $vr17 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX0, VX1 + vbitsel.v x3, VX0, VX1, VT0 + vbitsel.v x4, VI1, VI2, VT0 + VCMPLT VT0, x1, x3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT0 + VCMPLT VT0, VM0, x1 + addi.d I, I, -1 + vbitsel.v VM0, VM0, x1, VT0 + vbitsel.v VI0, VI0, x2, VT0 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vadd.w VI1, VI1, $vr18 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vadd.w VI2, VI1, $vr17 + VCMPLT VT0, VX0, VX1 + addi.d I, I, -1 + vbitsel.v VM1, VX0, VX1, VT0 + vbitsel.v VI2, VI1, VI2, VT0 + VCMPLT VT0, VM0, VM1 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, VI0, VI2, VT0 +#endif + blt $r0, I, .L24 + .align 3 + +.L25: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + VCMPLT VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 
+ vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfcmp.clt.s VT0, x1, x2 + vbitsel.v VM1, x1, x2, VT0 + vbitsel.v $vr17, VI1, VI2, VT0 + vfcmp.clt.s VT0, x3, x4 + vbitsel.v VM0, x3, x4, VT0 + vbitsel.v $vr18, VI3, VI4, VT0 + vfcmp.clt.s VT0, VM0, VM1 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, $vr18, $vr17, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 +#endif + .align 3 + +.L26: +#ifdef DOUBLE + VCMPLT VT0, x1, x2 + vbitsel.v VM0, x1, x2, VT0 + vbitsel.v VI0, VI1, VI2, VT0 +#else + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + VCMPLT VT0, VI2, VI0 + vbitsel.v VI0, VI0, VI2, VT0 +#endif + .align 3 + +.L27: +#ifndef DOUBLE + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + VCMPLT VT0, VI3, VI0 + vbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + VCMPLT VT0, VI4, VI0 + vbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: +#endif + MTG i0, $f20 + .align 3 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.d $f9, X, 0 + addi.d I, I, -1 + CMPLT $fcc0, $f15, $f9 + add.d X, X, INCX + fsel $f15, $f15, $f9, $fcc0 + fsel $f20, $f20, $f21, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/ismax_lsx.S b/kernel/loongarch64/ismax_lsx.S deleted file mode 100644 index 33b326bbd..000000000 --- a/kernel/loongarch64/ismax_lsx.S +++ /dev/null @@ -1,272 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define t1 $r13 -#define t2 $r15 -#define t3 $r18 -#define t4 $r16 -#define i0 $r17 -#define i1 $r14 -#define TEMP $r19 -#define x1 $vr9 -#define x2 $vr10 -#define x3 $vr11 -#define x4 $vr12 -#define VX0 $vr13 -#define VX1 $vr14 -#define VM0 $vr15 -#define VM1 $vr16 -#define VINC4 $vr17 -#define VINC8 $vr18 -#define VI0 $vr20 -#define VI1 $vr21 -#define VI2 $vr22 -#define VI3 $vr8 -#define VI4 $vr19 -#define VT0 $vr23 - - PROLOGUE - li.d i0, 0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - vld VM0, X, 0 - addi.w i0, i0, 1 - srai.d I, N, 3 - bge $r0, I, .L21 - slli.w i0, i0, 2 //4 - vreplgr2vr.w VINC4, i0 - slli.w i0, i0, 1 //8 - vreplgr2vr.w VINC8, i0 - addi.w i0, i0, -15 - vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 1 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 2 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 3 - addi.w i0, i0, 5 - vinsgr2vr.w VI0, i0, 0 //1 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 1 //2 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 2 //3 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 3 //4 - .align 3 - -.L10: - vld VX0, X, 0 * SIZE - vadd.w VI1, VI1, VINC8 - vld VX1, X, 4 * SIZE - vadd.w VI2, VI1, VINC4 - vfcmp.clt.s VT0, VX0, VX1 - addi.d I, I, -1 - vbitsel.v VM1, VX0, VX1, VT0 - vbitsel.v VI2, VI1, VI2, VT0 - vfcmp.clt.s VT0, VM0, VM1 - addi.d X, X, 8 * SIZE - vbitsel.v VM0, VM0, VM1, VT0 - vbitsel.v VI0, VI0, VI2, VT0 - blt $r0, I, .L10 - .align 3 - -.L15: - vreplvei.w VI1, VI0, 0 - vreplvei.w VI2, VI0, 1 - vreplvei.w VI3, VI0, 2 - vreplvei.w VI4, VI0, 3 - vreplvei.w x1, VM0, 0 - vreplvei.w x2, 
VM0, 1 - vreplvei.w x3, VM0, 2 - vreplvei.w x4, VM0, 3 - vfcmp.clt.s VT0, x1, x2 - vbitsel.v VM1, x1, x2, VT0 - vbitsel.v VINC4, VI1, VI2, VT0 - vfcmp.clt.s VT0, x3, x4 - vbitsel.v VM0, x3, x4, VT0 - vbitsel.v VINC8, VI3, VI4, VT0 - vfcmp.clt.s VT0, VM0, VM1 - vbitsel.v VM0, VM0, VM1, VT0 - vbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - vfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.s VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - b .L26 - .align 3 - -.L20: // INCX!=1 - move TEMP, X - addi.w i0, i0, 1 - ld.w t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.w VM0, t1, 0 - srai.d I, N, 3 - bge $r0, I, .L21 - ld.w t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.w t3, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.w t4, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.w VM0, t2, 1 - vinsgr2vr.w VM0, t3, 2 - vinsgr2vr.w VM0, t4, 3 - slli.w i0, i0, 2 //4 - vreplgr2vr.w VINC4, i0 - slli.w i0, i0, 1 //8 - vreplgr2vr.w VINC8, i0 - addi.w i0, i0, -15 - vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 1 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 2 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 3 - addi.w i0, i0, 5 - vinsgr2vr.w VI0, i0, 0 //1 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 1 //2 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 2 //3 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 3 //4 - .align 3 - -.L24: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - vadd.w VI1, VI1, VINC8 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vadd.w VI2, VI1, VINC4 - vfcmp.clt.s VT0, VX0, VX1 - addi.d I, I, -1 - vbitsel.v VM1, VX0, VX1, VT0 - vbitsel.v VI2, VI1, VI2, VT0 - vfcmp.clt.s VT0, VM0, VM1 - vbitsel.v VM0, VM0, VM1, VT0 - vbitsel.v VI0, VI0, VI2, VT0 - blt $r0, I, .L24 - .align 3 - -.L25: - vreplvei.w VI1, VI0, 0 - vreplvei.w VI2, VI0, 1 - vreplvei.w VI3, VI0, 2 - vreplvei.w VI4, VI0, 3 - vreplvei.w x1, VM0, 0 - vreplvei.w x2, VM0, 1 - vreplvei.w x3, VM0, 2 - vreplvei.w x4, VM0, 3 - vfcmp.clt.s VT0, x1, x2 - vbitsel.v VM1, x1, x2, VT0 - vbitsel.v VINC4, VI1, VI2, VT0 - vfcmp.clt.s VT0, x3, x4 - vbitsel.v VM0, x3, x4, VT0 - vbitsel.v VINC8, VI3, VI4, VT0 - vfcmp.clt.s VT0, VM0, VM1 - vbitsel.v VM0, VM0, VM1, VT0 - vbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - vfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.s VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - .align 3 - -.L26: - vfcmp.ceq.s VT0, VM0, x2 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L27 - vfcmp.clt.s VT0, VI2, VI0 - vbitsel.v VI0, VI0, VI2, VT0 - .align 3 - -.L27: - vfcmp.ceq.s VT0, VM0, x3 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L28 - vfcmp.clt.s VT0, VI3, VI0 - vbitsel.v VI0, VI0, VI3, VT0 - .align 3 - -.L28: - vfcmp.ceq.s VT0, VM0, x4 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L29 - vfcmp.clt.s VT0, VI4, VI0 - vbitsel.v VI0, VI0, VI4, VT0 - .align 3 - -.L29: - movfr2gr.s i0, $f20 - .align 3 - -.L21: //N<8 - andi I, N, 7 - bge $r0, 
I, .L999 - srai.d i1, N, 3 - slli.d i1, i1, 3 - addi.d i1, i1, 1 //current index - movgr2fr.d $f21, i1 - movgr2fr.d $f20, i0 - .align 3 - -.L22: - fld.d $f9, X, 0 - addi.d I, I, -1 - fcmp.clt.s $fcc0, $f15, $f9 - fsel $f15, $f15, $f9, $fcc0 - fsel $f20, $f20, $f21, $fcc0 - addi.d i1, i1, 1 - add.d X, X, INCX - movgr2fr.d $f21, i1 - blt $r0, I, .L22 - movfr2gr.s i0, $f20 - .align 3 - -.L999: - move $r4, $r17 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file From 116aee7527935a51b288a15b7feffc6ea2313e8a Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Thu, 28 Dec 2023 15:17:28 +0800 Subject: [PATCH 05/21] loongarch64: Refine imin optimization. --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 4 +- kernel/loongarch64/KERNEL.LOONGSON3R5 | 4 +- kernel/loongarch64/idmin_lasx.S | 272 ----------- kernel/loongarch64/idmin_lsx.S | 225 --------- .../loongarch64/{ismin_lasx.S => imin_lasx.S} | 266 ++++++++--- kernel/loongarch64/imin_lsx.S | 428 ++++++++++++++++++ kernel/loongarch64/ismin_lsx.S | 271 ----------- 7 files changed, 645 insertions(+), 825 deletions(-) delete mode 100644 kernel/loongarch64/idmin_lasx.S delete mode 100644 kernel/loongarch64/idmin_lsx.S rename kernel/loongarch64/{ismin_lasx.S => imin_lasx.S} (54%) create mode 100644 kernel/loongarch64/imin_lsx.S delete mode 100644 kernel/loongarch64/ismin_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 346f1fb45..9164f28ef 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -22,8 +22,8 @@ DMINKERNEL = min_lsx.S ISMAXKERNEL = imax_lsx.S IDMAXKERNEL = imax_lsx.S -ISMINKERNEL = ismin_lsx.S -IDMINKERNEL = idmin_lsx.S +ISMINKERNEL = imin_lsx.S +IDMINKERNEL = imin_lsx.S ISAMAXKERNEL = isamax_lsx.S IDAMAXKERNEL = idamax_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 6b4df2d61..5882b8932 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -22,8 +22,8 @@ DMINKERNEL = min_lsx.S ISMAXKERNEL = imax_lasx.S IDMAXKERNEL = imax_lasx.S -ISMINKERNEL = ismin_lasx.S -IDMINKERNEL = idmin_lasx.S +ISMINKERNEL = imin_lasx.S +IDMINKERNEL = imin_lasx.S ISAMAXKERNEL = isamax_lasx.S IDAMAXKERNEL = idamax_lasx.S diff --git a/kernel/loongarch64/idmin_lasx.S b/kernel/loongarch64/idmin_lasx.S deleted file mode 100644 index 7930d4963..000000000 --- a/kernel/loongarch64/idmin_lasx.S +++ /dev/null @@ -1,272 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define t1 $r13 -#define t2 $r15 -#define t3 $r18 -#define t4 $r16 -#define i0 $r17 -#define i1 $r14 -#define TEMP $r19 -#define x1 $xr9 -#define x2 $xr10 -#define x3 $xr11 -#define x4 $xr12 -#define VX0 $xr13 -#define VX1 $xr14 -#define VM0 $xr15 -#define VM1 $xr16 -#define VINC4 $xr17 -#define VINC8 $xr18 -#define VI0 $xr20 -#define VI1 $xr21 -#define VI2 $xr22 -#define VI3 $xr8 -#define VI4 $xr19 -#define VT0 $xr23 - - PROLOGUE - li.d i0, 0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - xvld VM0, X, 0 - addi.d i0, i0, 1 - srai.d I, N, 3 - bge $r0, I, .L21 - slli.d i0, i0, 2 //4 - xvreplgr2vr.d VINC4, i0 - slli.d i0, i0, 1 //8 - xvreplgr2vr.d VINC8, i0 - addi.d i0, i0, -15 - xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 
2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 3 - addi.d i0, i0, 5 - xvinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 1 //2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 2 //3 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 3 //4 - .align 3 - -.L10: - xvld VX0, X, 0 * SIZE - xvadd.d VI1, VI1, VINC8 - xvld VX1, X, 4 * SIZE - xvadd.d VI2, VI1, VINC4 - xvfcmp.clt.d VT0, VX1, VX0 - addi.d I, I, -1 - xvbitsel.v VM1, VX0, VX1, VT0 - xvbitsel.v VI2, VI1, VI2, VT0 - xvfcmp.clt.d VT0, VM1, VM0 - addi.d X, X, 8 * SIZE - xvbitsel.v VM0, VM0, VM1, VT0 - xvbitsel.v VI0, VI0, VI2, VT0 - blt $r0, I, .L10 - .align 3 - -.L15: - xvpickve.d VI1, VI0, 0 - xvpickve.d VI2, VI0, 1 - xvpickve.d VI3, VI0, 2 - xvpickve.d VI4, VI0, 3 - xvpickve.d x1, VM0, 0 - xvpickve.d x2, VM0, 1 - xvpickve.d x3, VM0, 2 - xvpickve.d x4, VM0, 3 - xvfcmp.clt.d VT0, x2, x1 - xvbitsel.v VM1, x1, x2, VT0 - xvbitsel.v VINC4, VI1, VI2, VT0 - xvfcmp.clt.d VT0, x4, x3 - xvbitsel.v VM0, x3, x4, VT0 - xvbitsel.v VINC8, VI3, VI4, VT0 - xvfcmp.clt.d VT0, VM1, VM0 - xvbitsel.v VM0, VM0, VM1, VT0 - xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - xvfcmp.ceq.d VT0, VM0, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - xvfcmp.clt.d VT0, VI1, VI0 - xvbitsel.v VI0, VI0, VI1, VT0 - b .L26 - .align 3 - -.L20: // INCX!=1 - move TEMP, X - addi.d i0, i0, 1 - ld.d t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - xvinsgr2vr.d VM0, t1, 0 - srai.d I, N, 3 - bge $r0, I, .L21 - ld.d t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.d t3, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.d t4, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - xvinsgr2vr.d VM0, t2, 1 - xvinsgr2vr.d VM0, t3, 2 - xvinsgr2vr.d VM0, t4, 3 - slli.d i0, i0, 2 //4 - xvreplgr2vr.d VINC4, i0 - slli.d i0, i0, 1 //8 - xvreplgr2vr.d VINC8, i0 - addi.d i0, i0, -15 - xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 3 - addi.d i0, i0, 5 - xvinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 1 //2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 2 //3 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 3 //4 - .align 3 - -.L24: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 - xvinsgr2vr.d VX0, t3, 2 - xvinsgr2vr.d VX0, t4, 3 - xvadd.d VI1, VI1, VINC8 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX1, t1, 0 - xvinsgr2vr.d VX1, t2, 1 - xvinsgr2vr.d VX1, t3, 2 - xvinsgr2vr.d VX1, t4, 3 - xvadd.d VI2, VI1, VINC4 - xvfcmp.clt.d VT0, VX1, VX0 - addi.d I, I, -1 - xvbitsel.v VM1, VX0, VX1, VT0 - xvbitsel.v VI2, VI1, VI2, VT0 - xvfcmp.clt.d VT0, VM1, VM0 - xvbitsel.v VM0, VM0, VM1, VT0 - xvbitsel.v VI0, VI0, VI2, VT0 - blt $r0, I, .L24 - .align 3 - -.L25: - xvpickve.d VI1, VI0, 0 - xvpickve.d VI2, VI0, 1 - xvpickve.d VI3, VI0, 2 - xvpickve.d VI4, VI0, 3 - xvpickve.d x1, VM0, 0 - xvpickve.d x2, VM0, 1 - xvpickve.d x3, VM0, 2 - xvpickve.d x4, VM0, 3 - xvfcmp.clt.d VT0, x2, x1 - xvbitsel.v VM1, x1, x2, VT0 - xvbitsel.v VINC4, VI1, VI2, VT0 - xvfcmp.clt.d VT0, x4, x3 - xvbitsel.v VM0, x3, x4, VT0 - xvbitsel.v VINC8, VI3, VI4, VT0 - xvfcmp.clt.d VT0, VM1, VM0 - xvbitsel.v VM0, VM0, 
VM1, VT0 - xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - xvfcmp.ceq.d VT0, VM0, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - xvfcmp.clt.d VT0, VI1, VI0 - xvbitsel.v VI0, VI0, VI1, VT0 - .align 3 - -.L26: - xvfcmp.ceq.d VT0, VM0, x2 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L27 - xvfcmp.clt.d VT0, VI2, VI0 - xvbitsel.v VI0, VI0, VI2, VT0 - .align 3 - -.L27: - xvfcmp.ceq.d VT0, VM0, x3 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L28 - xvfcmp.clt.d VT0, VI3, VI0 - xvbitsel.v VI0, VI0, VI3, VT0 - .align 3 - -.L28: - xvfcmp.ceq.d VT0, VM0, x4 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L29 - xvfcmp.clt.d VT0, VI4, VI0 - xvbitsel.v VI0, VI0, VI4, VT0 - .align 3 - -.L29: - movfr2gr.d i0, $f20 - .align 3 - -.L21: //N<8 - andi I, N, 7 - bge $r0, I, .L999 - srai.d i1, N, 3 - slli.d i1, i1, 3 - addi.d i1, i1, 1 //current index - movgr2fr.d $f21, i1 - movgr2fr.d $f20, i0 - .align 3 - -.L22: - fld.d $f9, X, 0 - addi.d I, I, -1 - fcmp.clt.d $fcc0, $f9, $f15 - add.d X, X, INCX - fsel $f15, $f15, $f9, $fcc0 - fsel $f20, $f20, $f21, $fcc0 - addi.d i1, i1, 1 - movgr2fr.d $f21, i1 - blt $r0, I, .L22 - movfr2gr.d i0, $f20 - .align 3 - -.L999: - move $r4, $r17 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/idmin_lsx.S b/kernel/loongarch64/idmin_lsx.S deleted file mode 100644 index 8b6edcbf0..000000000 --- a/kernel/loongarch64/idmin_lsx.S +++ /dev/null @@ -1,225 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define t1 $r13 -#define t2 $r15 -#define t3 $r18 -#define t4 $r16 -#define i0 $r17 -#define i1 $r14 -#define TEMP $r19 -#define x1 $vr9 -#define x2 $vr10 -#define x3 $vr11 -#define x4 $vr12 -#define VX0 $vr13 -#define VX1 $vr14 -#define VM0 $vr15 -#define VM1 $vr16 -#define VINC2 $vr17 -#define VINC4 $vr18 -#define VI0 $vr20 -#define VI1 $vr21 -#define VI2 $vr22 -#define VI3 $vr8 -#define VI4 $vr19 -#define VT0 $vr23 - - PROLOGUE - li.d i0, 0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - vld VM0, X, 0 - addi.d i0, i0, 1 - srai.d I, N, 3 - bge $r0, I, .L21 - slli.d i0, i0, 1 //2 - vreplgr2vr.d VINC2, i0 - slli.d i0, i0, 1 //4 - vreplgr2vr.d VINC4, i0 - addi.d i0, i0, -7 - vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - vinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 3 - vinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - vinsgr2vr.d VI0, i0, 1 //2 - .align 3 - -.L10: - vld VX0, X, 0 * SIZE - vadd.d VI1, VI1, VINC4 - vld VX1, X, 2 * SIZE - vadd.d VI2, VI1, VINC2 - vfcmp.clt.d VT0, VX1, VX0 - vbitsel.v x1, VX0, VX1, VT0 - vbitsel.v x2, VI1, VI2, VT0 - vld VX0, X, 4 * SIZE - vadd.d VI1, VI2, VINC2 - vld VX1, X, 6 * SIZE - vadd.d VI2, VI1, VINC2 - vfcmp.clt.d VT0, VX1, VX0 - addi.d I, I, -1 - vbitsel.v x3, VX0, VX1, VT0 - vbitsel.v x4, VI1, VI2, VT0 - vfcmp.clt.d VT0, x3, x1 - addi.d X, X, 8 * SIZE - vbitsel.v x1, x1, x3, VT0 - vbitsel.v x2, x2, x4, VT0 - vfcmp.clt.d VT0, x1, VM0 - vbitsel.v VM0, VM0, x1, VT0 - vbitsel.v VI0, VI0, x2, VT0 - blt $r0, I, .L10 - .align 3 - -.L15: - vreplvei.d VI1, VI0, 0 - vreplvei.d VI2, VI0, 1 - vreplvei.d x1, VM0, 0 - vreplvei.d x2, VM0, 1 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - vfcmp.ceq.d VT0, x2, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.d VT0, VI1, VI0 - vbitsel.v VI0, 
VI0, VI1, VT0 - b .L27 - .align 3 - -.L20: // INCX!=1 - move TEMP, X - addi.d i0, i0, 1 - ld.d t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.d VM0, t1, 0 - srai.d I, N, 3 - bge $r0, I, .L21 - ld.d t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.d VM0, t2, 1 - slli.d i0, i0, 1 //2 - vreplgr2vr.d VINC2, i0 - slli.d i0, i0, 1 //4 - vreplgr2vr.d VINC4, i0 - addi.d i0, i0, -7 - vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - vinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 3 - vinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - vinsgr2vr.d VI0, i0, 1 //2 - .align 3 - -.L24: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - vadd.d VI1, VI1, VINC4 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - vadd.d VI2, VI1, VINC2 - vfcmp.clt.d VT0, VX1, VX0 - vbitsel.v x1, VX0, VX1, VT0 - vbitsel.v x2, VI1, VI2, VT0 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - vadd.d VI1, VI2, VINC2 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - vadd.d VI2, VI1, VINC2 - vfcmp.clt.d VT0, VX1, VX0 - vbitsel.v x3, VX0, VX1, VT0 - vbitsel.v x4, VI1, VI2, VT0 - vfcmp.clt.d VT0, x3, x1 - vbitsel.v x1, x1, x3, VT0 - vbitsel.v x2, x2, x4, VT0 - vfcmp.clt.d VT0, x1, VM0 - addi.d I, I, -1 - vbitsel.v VM0, VM0, x1, VT0 - vbitsel.v VI0, VI0, x2, VT0 - blt $r0, I, .L24 - .align 3 - -.L25: - vreplvei.d VI1, VI0, 0 - vreplvei.d VI2, VI0, 1 - vreplvei.d x1, VM0, 0 - vreplvei.d x2, VM0, 1 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - vfcmp.ceq.d VT0, x2, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.d VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - b .L27 - .align 3 - -.L26: - vfcmp.clt.d VT0, x2, x1 - vbitsel.v VM0, x1, x2, VT0 - vbitsel.v VI0, VI1, VI2, VT0 - .align 3 - -.L27: - movfr2gr.d i0, $f20 - .align 3 - -.L21: //N<8 - andi I, N, 7 - bge $r0, I, .L999 - srai.d i1, N, 3 - slli.d i1, i1, 3 - addi.d i1, i1, 1 //current index - movgr2fr.d $f21, i1 - movgr2fr.d $f20, i0 - .align 3 - -.L22: - fld.d $f9, X, 0 - addi.d I, I, -1 - fcmp.clt.d $fcc0, $f9, $f15 - add.d X, X, INCX - fsel $f15, $f15, $f9, $fcc0 - fsel $f20, $f20, $f21, $fcc0 - addi.d i1, i1, 1 - movgr2fr.d $f21, i1 - blt $r0, I, .L22 - movfr2gr.d i0, $f20 - .align 3 - -.L999: - move $r4, $r17 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/ismin_lasx.S b/kernel/loongarch64/imin_lasx.S similarity index 54% rename from kernel/loongarch64/ismin_lasx.S rename to kernel/loongarch64/imin_lasx.S index 15f6e2ec9..5306828e2 100644 --- a/kernel/loongarch64/ismin_lasx.S +++ b/kernel/loongarch64/imin_lasx.S @@ -1,3 +1,30 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + #define ASSEMBLER #include "common.h" @@ -39,6 +66,31 @@ slli.d INCX, INCX, BASE_SHIFT bne INCX, TEMP, .L20 xvld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else addi.w i0, i0, 1 srai.d I, N, 3 bge $r0, I, .L21 @@ -76,20 +128,45 @@ xvinsgr2vr.w VI0, i0, 6 //7 addi.w i0, i0, 1 xvinsgr2vr.w VI0, i0, 7 //8 +#endif .align 3 .L10: xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvadd.d VI1, VI1, VINC8 + xvld VX1, X, 4 * SIZE + xvadd.d VI2, VI1, VINC4 + XVCMPLT VT0, VX1, VX0 + addi.d I, I, -1 + xvbitsel.v VM1, VX0, VX1, VT0 + xvbitsel.v VI2, VI1, VI2, VT0 + XVCMPLT VT0, VM1, VM0 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VI0, VI2, VT0 +#else xvadd.w VI1, VI1, VINC8 - xvfcmp.clt.s VT0, VX0, VM0 + XVCMPLT VT0, VX0, VM0 addi.d I, I, -1 xvbitsel.v VM0, VM0, VX0, VT0 xvbitsel.v VI0, VI0, VI1, VT0 addi.d X, X, 8 * SIZE +#endif blt $r0, I, .L10 .align 3 .L15: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 +#else xvxor.v VX0, VX0, VX0 xvor.v VX0, VI0, VX0 xvxor.v VX1, VX1, VX1 @@ -98,32 +175,67 @@ xvpickve.w VI2, VI0, 1 xvpickve.w VI3, VI0, 2 xvpickve.w VI4, VI0, 3 - xvpickve.w x1, VM0, 0 - xvpickve.w x2, VM0, 1 - xvpickve.w x3, VM0, 2 - xvpickve.w x4, VM0, 3 - xvfcmp.clt.s VT0, x2, x1 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 +#endif + XVCMPLT VT0, x2, x1 xvbitsel.v VM1, x1, x2, VT0 xvbitsel.v VINC4, VI1, VI2, VT0 - xvfcmp.clt.s VT0, x4, x3 + XVCMPLT VT0, x4, x3 xvbitsel.v VM0, x3, x4, VT0 xvbitsel.v VINC8, VI3, VI4, VT0 - xvfcmp.clt.s VT0, VM1, VM0 + XVCMPLT VT0, VM1, VM0 xvbitsel.v VM0, VM0, 
VM1, VT0 xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - xvfcmp.ceq.s VT0, x1, VM0 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f9 bceqz $fcc0, .L26 - xvfcmp.clt.s VT0, VI1, VI0 + XVCMPLT VT0, VI1, VI0 xvbitsel.v VI0, VI0, VI1, VT0 b .L26 .align 3 .L20: // INCX!=1 move TEMP, X +#ifdef DOUBLE + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t1, 0 + xvinsgr2vr.d VM0, t2, 1 + xvinsgr2vr.d VM0, t3, 2 + xvinsgr2vr.d VM0, t4, 3 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else addi.w i0, i0, 1 ld.w t1, TEMP, 0 * SIZE add.d TEMP, TEMP, INCX @@ -185,9 +297,46 @@ xvinsgr2vr.w VI0, i0, 6 //7 addi.w i0, i0, 1 xvinsgr2vr.w VI0, i0, 7 //8 +#endif .align 3 .L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + xvadd.d VI1, VI1, VINC8 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvadd.d VI1, VI1, VINC8 + xvadd.d VI2, VI1, VINC4 + XVCMPLT VT0, VX1, VX0 + addi.d I, I, -1 + xvbitsel.v VM1, VX0, VX1, VT0 + xvbitsel.v VI2, VI1, VI2, VT0 + XVCMPLT VT0, VM1, VM0 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VI0, VI2, VT0 +#else ld.w t1, X, 0 * SIZE add.d X, X, INCX ld.w t2, X, 0 * SIZE @@ -213,73 +362,83 @@ xvinsgr2vr.w VX0, t3, 6 xvinsgr2vr.w VX0, t4, 7 xvadd.w VI1, VI1, VINC8 - xvfcmp.clt.s VT0, VX0, VM0 + XVCMPLT VT0, VX0, VM0 addi.d I, I, -1 xvbitsel.v VM0, VM0, VX0, VT0 xvbitsel.v VI0, VI0, VI1, VT0 +#endif blt $r0, I, .L24 .align 3 .L25: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 +#else xvxor.v VX0, VX0, VX0 - xvor.v VX0, VI0, VX0 + xvor.v VX0, VI0, VX0 xvxor.v VX1, VX1, VX1 - xvor.v VX1, VM0, VX1 + xvor.v VX1, VM0, VX1 xvpickve.w VI1, VI0, 0 xvpickve.w VI2, VI0, 1 xvpickve.w VI3, VI0, 2 xvpickve.w VI4, VI0, 3 - xvpickve.w x1, VM0, 0 - xvpickve.w x2, VM0, 1 - xvpickve.w x3, VM0, 2 - xvpickve.w x4, VM0, 3 - xvfcmp.clt.s VT0, x2, x1 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 +#endif + XVCMPLT VT0, x2, x1 xvbitsel.v VM1, x1, x2, VT0 xvbitsel.v VINC4, VI1, VI2, VT0 - xvfcmp.clt.s VT0, x4, x3 + XVCMPLT VT0, x4, x3 xvbitsel.v VM0, x3, x4, VT0 xvbitsel.v VINC8, VI3, VI4, VT0 - xvfcmp.clt.s VT0, VM1, VM0 + XVCMPLT VT0, VM1, VM0 
xvbitsel.v VM0, VM0, VM1, VT0 xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - xvfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f9 bceqz $fcc0, .L26 - xvfcmp.clt.s VT0, VI1, VI0 + XVCMPLT VT0, VI1, VI0 xvbitsel.v VI0, VI0, VI1, VT0 .align 3 .L26: - xvfcmp.ceq.s VT0, VM0, x2 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f10 bceqz $fcc0, .L27 - xvfcmp.clt.s VT0, VI2, VI0 + XVCMPLT VT0, VI2, VI0 xvbitsel.v VI0, VI0, VI2, VT0 .align 3 .L27: - xvfcmp.ceq.s VT0, VM0, x3 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f11 bceqz $fcc0, .L28 - xvfcmp.clt.s VT0, VI3, VI0 + XVCMPLT VT0, VI3, VI0 xvbitsel.v VI0, VI0, VI3, VT0 .align 3 .L28: - xvfcmp.ceq.s VT0, VM0, x4 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f12 bceqz $fcc0, .L29 - xvfcmp.clt.s VT0, VI4, VI0 + XVCMPLT VT0, VI4, VI0 xvbitsel.v VI0, VI0, VI4, VT0 .align 3 .L29: +#ifdef DOUBLE + MTG i0, $f20 +#else fmov.s $f16, $f20 +#endif .align 3 +#ifndef DOUBLE .L252: xvxor.v VI0, VI0, VI0 xvor.v VI0, VI0, VX0 @@ -294,13 +453,13 @@ xvpickve.w x2, VM0, 5 xvpickve.w x3, VM0, 6 xvpickve.w x4, VM0, 7 - xvfcmp.clt.s VT0, x2, x1 + XVCMPLT VT0, x2, x1 xvbitsel.v x1, x1, x2, VT0 xvbitsel.v VINC4, VI1, VI2, VT0 - xvfcmp.clt.s VT0, x4, x3 + XVCMPLT VT0, x4, x3 xvbitsel.v VM0, x3, x4, VT0 xvbitsel.v VINC8, VI3, VI4, VT0 - xvfcmp.clt.s VT0, x1, VM0 + XVCMPLT VT0, x1, VM0 xvbitsel.v VM0, VM0, x1, VT0 xvbitsel.v VI0, VINC8, VINC4, VT0 li.d TEMP, 1 //处理尾数相等时取最小序号 @@ -309,7 +468,7 @@ xvfcmp.ceq.s VT0, VM0, x1 fcmp.ceq.s $fcc0, $f23, $f17 bceqz $fcc0, .L262 - xvfcmp.clt.s VT0, VI1, VI0 + XVCMPLT VT0, VI1, VI0 xvbitsel.v VI0, VI0, VI1, VT0 .align 3 @@ -317,7 +476,7 @@ xvfcmp.ceq.s VT0, VM0, x2 fcmp.ceq.s $fcc0, $f23, $f17 bceqz $fcc0, .L272 - xvfcmp.clt.s VT0, VI2, VI0 + XVCMPLT VT0, VI2, VI0 xvbitsel.v VI0, VI0, VI2, VT0 .align 3 @@ -325,7 +484,7 @@ xvfcmp.ceq.s VT0, VM0, x3 fcmp.ceq.s $fcc0, $f23, $f17 bceqz $fcc0, .L282 - xvfcmp.clt.s VT0, VI3, VI0 + XVCMPLT VT0, VI3, VI0 xvbitsel.v VI0, VI0, VI3, VT0 .align 3 @@ -333,17 +492,18 @@ xvfcmp.ceq.s VT0, VM0, x4 fcmp.ceq.s $fcc0, $f23, $f17 bceqz $fcc0, .L292 - xvfcmp.clt.s VT0, VI4, VI0 + XVCMPLT VT0, VI4, VI0 xvbitsel.v VI0, VI0, VI4, VT0 .align 3 .L292: - fcmp.clt.s $fcc0, $f13, $f15 + CMPLT $fcc0, $f13, $f15 fsel $f15, $f15, $f13, $fcc0 fsel $f20, $f20, $f16, $fcc0 - movfr2gr.s i0, $f20 + MTG i0, $f20 +#endif -.L21: //N<8 +.L21: //N<8 andi I, N, 7 bge $r0, I, .L999 srai.d i1, N, 3 @@ -356,14 +516,14 @@ .L22: fld.d $f9, X, 0 addi.d I, I, -1 - fcmp.clt.s $fcc0, $f9, $f15 + CMPLT $fcc0, $f9, $f15 + add.d X, X, INCX fsel $f15, $f15, $f9, $fcc0 fsel $f20, $f20, $f21, $fcc0 addi.d i1, i1, 1 movgr2fr.d $f21, i1 - add.d X, X, INCX blt $r0, I, .L22 - movfr2gr.s i0, $f20 + MTG i0, $f20 .align 3 .L999: @@ -371,4 +531,4 @@ jirl $r0, $r1, 0x0 .align 3 - EPILOGUE \ No newline at end of file + EPILOGUE diff --git a/kernel/loongarch64/imin_lsx.S b/kernel/loongarch64/imin_lsx.S new file mode 100644 index 000000000..a0c411e7a --- /dev/null +++ b/kernel/loongarch64/imin_lsx.S @@ -0,0 +1,428 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + vld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 1 //2 + vreplgr2vr.d $vr17, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d $vr18, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w $vr17, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w $vr18, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L10: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vadd.d VI1, VI1, $vr18 + vld VX1, X, 2 * SIZE + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX1, VX0 + vbitsel.v x1, VX0, VX1, VT0 + vbitsel.v x2, VI1, VI2, VT0 + vld VX0, X, 4 * SIZE + vadd.d VI1, VI2, $vr17 + vld VX1, X, 6 * SIZE + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX1, VX0 + addi.d I, I, -1 + vbitsel.v x3, VX0, VX1, VT0 + vbitsel.v x4, VI1, VI2, VT0 + VCMPLT VT0, x3, x1 + addi.d X, X, 8 * SIZE + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT0 + 
VCMPLT VT0, x1, VM0 + vbitsel.v VM0, VM0, x1, VT0 + vbitsel.v VI0, VI0, x2, VT0 +#else + vadd.w VI1, VI1, $vr18 + vld VX1, X, 4 * SIZE + vadd.w VI2, VI1, $vr17 + VCMPLT VT0, VX1, VX0 + addi.d I, I, -1 + vbitsel.v VM1, VX0, VX1, VT0 + vbitsel.v VI2, VI1, VI2, VT0 + VCMPLT VT0, VM1, VM0 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, VI0, VI2, VT0 +#endif + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + VCMPLT VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + VCMPLT VT0, x2, x1 + vbitsel.v VM1, x1, x2, VT0 + vbitsel.v $vr17, VI1, VI2, VT0 + VCMPLT VT0, x4, x3 + vbitsel.v VM0, x3, x4, VT0 + vbitsel.v $vr18, VI3, VI4, VT0 + VCMPLT VT0, VM1, VM0 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, $vr18, $vr17, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + VCMPLT VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L26 +#endif + .align 3 + +.L20: // INCX!=1 + move TEMP, X +#ifdef DOUBLE + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t2, 1 + slli.d i0, i0, 1 //2 + vreplgr2vr.d $vr17, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d $vr18, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t2, 1 + vinsgr2vr.w VM0, t3, 2 + vinsgr2vr.w VM0, t4, 3 + slli.w i0, i0, 2 //4 + vreplgr2vr.w $vr17, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w $vr18, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI1, $vr18 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX1, VX0 + vbitsel.v x1, VX0, VX1, VT0 + vbitsel.v x2, VI1, VI2, VT0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI2, $vr17 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX1, VX0 + vbitsel.v x3, VX0, VX1, VT0 + vbitsel.v x4, VI1, VI2, VT0 + VCMPLT 
VT0, x3, x1 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT0 + VCMPLT VT0, x1, VM0 + addi.d I, I, -1 + vbitsel.v VM0, VM0, x1, VT0 + vbitsel.v VI0, VI0, x2, VT0 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vadd.w VI1, VI1, $vr18 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vadd.w VI2, VI1, $vr17 + VCMPLT VT0, VX1, VX0 + addi.d I, I, -1 + vbitsel.v VM1, VX0, VX1, VT0 + vbitsel.v VI2, VI1, VI2, VT0 + VCMPLT VT0, VM1, VM0 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, VI0, VI2, VT0 +#endif + blt $r0, I, .L24 + .align 3 + +.L25: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + VCMPLT VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + VCMPLT VT0, x2, x1 + vbitsel.v VM1, x1, x2, VT0 + vbitsel.v $vr17, VI1, VI2, VT0 + VCMPLT VT0, x4, x3 + vbitsel.v VM0, x3, x4, VT0 + vbitsel.v $vr18, VI3, VI4, VT0 + VCMPLT VT0, VM1, VM0 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, $vr18, $vr17, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + VCMPLT VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 +#endif + .align 3 + +.L26: +#ifdef DOUBLE + VCMPLT VT0, x2, x1 + vbitsel.v VM0, x1, x2, VT0 + vbitsel.v VI0, VI1, VI2, VT0 +#else + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + VCMPLT VT0, VI2, VI0 + vbitsel.v VI0, VI0, VI2, VT0 +#endif + .align 3 + +.L27: +#ifndef DOUBLE + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + VCMPLT VT0, VI3, VI0 + vbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + VCMPLT VT0, VI4, VI0 + vbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: +#endif + MTG i0, $f20 + .align 3 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.d $f9, X, 0 + addi.d I, I, -1 + CMPLT $fcc0, $f9, $f15 + add.d X, X, INCX + fsel $f15, $f15, $f9, $fcc0 + fsel $f20, $f20, $f21, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/ismin_lsx.S b/kernel/loongarch64/ismin_lsx.S deleted file mode 100644 index f90ebbd57..000000000 --- a/kernel/loongarch64/ismin_lsx.S +++ /dev/null @@ -1,271 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define t1 $r13 -#define t2 $r15 -#define t3 $r18 -#define t4 $r16 -#define i0 $r17 -#define i1 $r14 -#define TEMP $r19 -#define x1 $vr9 -#define x2 $vr10 -#define x3 $vr11 -#define x4 $vr12 -#define VX0 $vr13 -#define VX1 $vr14 -#define VM0 $vr15 -#define VM1 $vr16 -#define VINC4 $vr17 -#define VINC8 $vr18 -#define VI0 $vr20 -#define VI1 $vr21 -#define VI2 $vr22 -#define VI3 $vr8 -#define VI4 $vr19 -#define 
VT0 $vr23 - - PROLOGUE - li.d i0, 0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - vld VM0, X, 0 - addi.w i0, i0, 1 - srai.d I, N, 3 - bge $r0, I, .L21 - slli.w i0, i0, 2 //4 - vreplgr2vr.w VINC4, i0 - slli.w i0, i0, 1 //8 - vreplgr2vr.w VINC8, i0 - addi.w i0, i0, -15 - vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 1 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 2 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 3 - addi.w i0, i0, 5 - vinsgr2vr.w VI0, i0, 0 //1 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 1 //2 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 2 //3 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 3 //4 - .align 3 - -.L10: - vld VX0, X, 0 * SIZE - vadd.w VI1, VI1, VINC8 - vld VX1, X, 4 * SIZE - vadd.w VI2, VI1, VINC4 - vfcmp.clt.s VT0, VX1, VX0 - addi.d I, I, -1 - vbitsel.v VM1, VX0, VX1, VT0 - vbitsel.v VI2, VI1, VI2, VT0 - vfcmp.clt.s VT0, VM1, VM0 - addi.d X, X, 8 * SIZE - vbitsel.v VM0, VM0, VM1, VT0 - vbitsel.v VI0, VI0, VI2, VT0 - blt $r0, I, .L10 - .align 3 - -.L15: - vreplvei.w VI1, VI0, 0 - vreplvei.w VI2, VI0, 1 - vreplvei.w VI3, VI0, 2 - vreplvei.w VI4, VI0, 3 - vreplvei.w x1, VM0, 0 - vreplvei.w x2, VM0, 1 - vreplvei.w x3, VM0, 2 - vreplvei.w x4, VM0, 3 - vfcmp.clt.s VT0, x2, x1 - vbitsel.v VM1, x1, x2, VT0 - vbitsel.v VINC4, VI1, VI2, VT0 - vfcmp.clt.s VT0, x4, x3 - vbitsel.v VM0, x3, x4, VT0 - vbitsel.v VINC8, VI3, VI4, VT0 - vfcmp.clt.s VT0, VM1, VM0 - vbitsel.v VM0, VM0, VM1, VT0 - vbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - vfcmp.ceq.s VT0, x1, VM0 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.s VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - b .L26 - .align 3 - -.L20: // INCX!=1 - move TEMP, X - addi.w i0, i0, 1 - ld.w t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.w VM0, t1, 0 - srai.d I, N, 3 - bge $r0, I, .L21 - ld.w t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.w t3, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.w t4, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.w VM0, t2, 1 - vinsgr2vr.w VM0, t3, 2 - vinsgr2vr.w VM0, t4, 3 - slli.w i0, i0, 2 //4 - vreplgr2vr.w VINC4, i0 - slli.w i0, i0, 1 //8 - vreplgr2vr.w VINC8, i0 - addi.w i0, i0, -15 - vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 1 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 2 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 3 - addi.w i0, i0, 5 - vinsgr2vr.w VI0, i0, 0 //1 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 1 //2 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 2 //3 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 3 //4 - .align 3 - -.L24: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - vadd.w VI1, VI1, VINC8 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vadd.w VI2, VI1, VINC4 - vfcmp.clt.s VT0, VX1, VX0 - addi.d I, I, -1 - vbitsel.v VM1, VX0, VX1, VT0 - vbitsel.v VI2, VI1, VI2, VT0 - vfcmp.clt.s VT0, VM1, VM0 - vbitsel.v VM0, VM0, VM1, VT0 - vbitsel.v VI0, VI0, VI2, VT0 - 
blt $r0, I, .L24 - .align 3 - -.L25: - vreplvei.w VI1, VI0, 0 - vreplvei.w VI2, VI0, 1 - vreplvei.w VI3, VI0, 2 - vreplvei.w VI4, VI0, 3 - vreplvei.w x1, VM0, 0 - vreplvei.w x2, VM0, 1 - vreplvei.w x3, VM0, 2 - vreplvei.w x4, VM0, 3 - vfcmp.clt.s VT0, x2, x1 - vbitsel.v VM1, x1, x2, VT0 - vbitsel.v VINC4, VI1, VI2, VT0 - vfcmp.clt.s VT0, x4, x3 - vbitsel.v VM0, x3, x4, VT0 - vbitsel.v VINC8, VI3, VI4, VT0 - vfcmp.clt.s VT0, VM1, VM0 - vbitsel.v VM0, VM0, VM1, VT0 - vbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - vfcmp.ceq.s VT0, x1, VM0 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.s VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - .align 3 - -.L26: - vfcmp.ceq.s VT0, x2, VM0 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L27 - vfcmp.clt.s VT0, VI2, VI0 - vbitsel.v VI0, VI0, VI2, VT0 - .align 3 - -.L27: - vfcmp.ceq.s VT0, x3, VM0 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L28 - vfcmp.clt.s VT0, VI3, VI0 - vbitsel.v VI0, VI0, VI3, VT0 - .align 3 - -.L28: - vfcmp.ceq.s VT0, x4, VM0 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L29 - vfcmp.clt.s VT0, VI4, VI0 - vbitsel.v VI0, VI0, VI4, VT0 - .align 3 - -.L29: - movfr2gr.s i0, $f20 - -.L21: //N<8 - andi I, N, 7 - bge $r0, I, .L999 - srai.d i1, N, 3 - slli.d i1, i1, 3 - addi.d i1, i1, 1 //current index - movgr2fr.d $f21, i1 - movgr2fr.d $f20, i0 - .align 3 - -.L22: - fld.d $f9, X, 0 - fcmp.clt.s $fcc0, $f9, $f15 - fsel $f15, $f15, $f9, $fcc0 - fsel $f20, $f20, $f21, $fcc0 - addi.d I, I, -1 - addi.d i1, i1, 1 - add.d X, X, INCX - movgr2fr.d $f21, i1 - blt $r0, I, .L22 - movfr2gr.s i0, $f20 - .align 3 - -.L999: - move $r4, $r17 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file From ea70e165c71201e46961c479e53c17d4034290f8 Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Thu, 28 Dec 2023 20:07:59 +0800 Subject: [PATCH 06/21] loongarch64: Refine rot optimization. 
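
The single- and double-precision rot kernels are merged into the shared
sources rot_lsx.S and rot_lasx.S: one copy of the code now assembles for
either precision, with the new FFINT/VMUL/VMSUB/XVMUL/XVMSUB macros in
common_loongarch64.h expanding to the .s or .d instruction forms
depending on DOUBLE. Roughly, the vectorized main loop applies the plane
rotation x' = c*x + s*y, y' = c*y - s*x with the precision-neutral
macros, as in this sketch taken from the C!=0, S!=0 path of rot_lasx.S
(VX0/VX2 hold x and y, VXC/VXS the broadcast c and s):

    XVMUL   VT0, VX0, VXC          // t0 = c * x
    XVFMADD VT0, VX2, VXS, VT0     // t0 = c * x + s * y
    XVMUL   VT1, VX0, VXS          // t1 = s * x
    XVMSUB  VT1, VX2, VXC, VT1     // t1 = c * y - s * x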
--- common_loongarch64.h | 10 + kernel/loongarch64/KERNEL.LOONGSON2K1000 | 4 +- kernel/loongarch64/KERNEL.LOONGSON3R5 | 4 +- kernel/loongarch64/drot_lsx.S | 1050 ---------- .../loongarch64/{drot_lasx.S => rot_lasx.S} | 779 ++++++- kernel/loongarch64/rot_lsx.S | 1791 +++++++++++++++++ kernel/loongarch64/srot_lasx.S | 863 -------- kernel/loongarch64/srot_lsx.S | 927 --------- 8 files changed, 2532 insertions(+), 2896 deletions(-) delete mode 100644 kernel/loongarch64/drot_lsx.S rename kernel/loongarch64/{drot_lasx.S => rot_lasx.S} (52%) create mode 100644 kernel/loongarch64/rot_lsx.S delete mode 100644 kernel/loongarch64/srot_lasx.S delete mode 100644 kernel/loongarch64/srot_lsx.S diff --git a/common_loongarch64.h b/common_loongarch64.h index 846fc0dbd..13514d6e0 100644 --- a/common_loongarch64.h +++ b/common_loongarch64.h @@ -129,6 +129,7 @@ static inline int WhereAmI(void){ #define CMPLE fcmp.cle.d #define CMPLT fcmp.clt.d #define NEG fneg.d +#define FFINT ffint.d.l #define XVFSUB xvfsub.d #define XVFADD xvfadd.d @@ -139,6 +140,8 @@ static inline int WhereAmI(void){ #define XVFMAXA xvfmaxa.d #define XVCMPEQ xvfcmp.ceq.d #define XVCMPLT xvfcmp.clt.d +#define XVMUL xvfmul.d +#define XVMSUB xvfmsub.d #define VFSUB vfsub.d #define VFADD vfadd.d @@ -149,6 +152,8 @@ static inline int WhereAmI(void){ #define VFMAXA vfmaxa.d #define VCMPEQ vfcmp.ceq.d #define VCMPLT vfcmp.clt.d +#define VMUL vfmul.d +#define VMSUB vfmsub.d #else @@ -174,6 +179,7 @@ static inline int WhereAmI(void){ #define CMPLE fcmp.cle.s #define CMPLT fcmp.clt.s #define NEG fneg.s +#define FFINT ffint.s.l #define XVFSUB xvfsub.s #define XVFADD xvfadd.s @@ -184,6 +190,8 @@ static inline int WhereAmI(void){ #define XVFMAXA xvfmaxa.s #define XVCMPEQ xvfcmp.ceq.s #define XVCMPLT xvfcmp.clt.s +#define XVMUL xvfmul.s +#define XVMSUB xvfmsub.s #define VFSUB vfsub.s #define VFADD vfadd.s @@ -194,6 +202,8 @@ static inline int WhereAmI(void){ #define VFMAXA vfmaxa.s #define VCMPEQ vfcmp.ceq.s #define VCMPLT vfcmp.clt.s +#define VMUL vfmul.s +#define VMSUB vfmsub.s #endif /* defined(DOUBLE) */ diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 9164f28ef..b315c81f2 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -49,8 +49,8 @@ DSUMKERNEL = sum_lsx.S SASUMKERNEL = sasum_lsx.S DASUMKERNEL = dasum_lsx.S -SROTKERNEL = srot_lsx.S -DROTKERNEL = drot_lsx.S +SROTKERNEL = rot_lsx.S +DROTKERNEL = rot_lsx.S SNRM2KERNEL = snrm2_lsx.S DNRM2KERNEL = dnrm2_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 5882b8932..577f6316e 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -49,8 +49,8 @@ DSUMKERNEL = sum_lasx.S SASUMKERNEL = sasum_lasx.S DASUMKERNEL = dasum_lasx.S -SROTKERNEL = srot_lasx.S -DROTKERNEL = drot_lasx.S +SROTKERNEL = rot_lasx.S +DROTKERNEL = rot_lasx.S SNRM2KERNEL = snrm2_lasx.S DNRM2KERNEL = dnrm2_lasx.S diff --git a/kernel/loongarch64/drot_lsx.S b/kernel/loongarch64/drot_lsx.S deleted file mode 100644 index 6db803b1c..000000000 --- a/kernel/loongarch64/drot_lsx.S +++ /dev/null @@ -1,1050 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define Y $r7 -#define INCY $r8 -#define C $f0 -#define S $f1 - -#define I $r12 -#define TEMP $r13 -#define t1 $r14 -#define t2 $r16 -#define t3 $r15 -#define t4 $r17 -#define XX $r18 -#define YY $r19 -#define a1 $f12 -#define VX0 $vr8 -#define VX1 $vr20 
-#define VX2 $vr21 -#define VX3 $vr22 -#define VT0 $vr10 -#define VT1 $vr18 -#define VXC $vr23 -#define VXS $vr9 -#define VXZ $vr19 - - PROLOGUE - - bge $r0, N, .L999 - li.d TEMP, 1 - movgr2fr.d a1, $r0 - ffint.d.l a1, a1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - movfr2gr.d t1, C - vreplgr2vr.d VXC, t1 - movfr2gr.d t2, S - vreplgr2vr.d VXS, t2 - movfr2gr.d t3, a1 - vreplgr2vr.d VXZ, t3 - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L997 - fcmp.ceq.d $fcc0, C, a1 - bcnez $fcc0, .L110 - fcmp.ceq.d $fcc0, S, a1 - bcnez $fcc0, .L112 // C!=0 S==0 - b .L111 // C!=0 S!=0 - .align 3 - -.L110: - fcmp.ceq.d $fcc0, S, a1 - bcnez $fcc0, .L114 // C==0 S==0 - b .L113 // C==0 S!=0 - .align 3 - -.L111: // C!=0 S!=0 - vld VX0, X, 0 * SIZE - vld VX2, Y, 0 * SIZE - vld VX1, X, 2 * SIZE - vld VX3, Y, 2 * SIZE - vfmul.d VT0, VX0, VXC - vfmadd.d VT0, VX2, VXS, VT0 - vfmul.d VT1, VX0, VXS - vfmsub.d VT1, VX2, VXC, VT1 - vst VT0, X, 0 * SIZE - vst VT1, Y, 0 * SIZE - vfmul.d VT0, VX1, VXC - vfmadd.d VT0, VX3, VXS, VT0 - vfmul.d VT1, VX1, VXS - vfmsub.d VT1, VX3, VXC, VT1 - vst VT0, X, 2 * SIZE - vst VT1, Y, 2 * SIZE - vld VX0, X, 4 * SIZE - vld VX2, Y, 4 * SIZE - vld VX1, X, 6 * SIZE - vld VX3, Y, 6 * SIZE - vfmul.d VT0, VX0, VXC - vfmadd.d VT0, VX2, VXS, VT0 - vfmul.d VT1, VX0, VXS - vfmsub.d VT1, VX2, VXC, VT1 - vst VT0, X, 4 * SIZE - vst VT1, Y, 4 * SIZE - vfmul.d VT0, VX1, VXC - vfmadd.d VT0, VX3, VXS, VT0 - vfmul.d VT1, VX1, VXS - vfmsub.d VT1, VX3, VXC, VT1 - vst VT0, X, 6 * SIZE - vst VT1, Y, 6 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L111 - b .L997 - .align 3 - -.L112: // C!=0 S==0 - vld VX0, X, 0 * SIZE - vld VX2, Y, 0 * SIZE - vld VX1, X, 2 * SIZE - vld VX3, Y, 2 * SIZE - vfmul.d VT0, VX0, VXC - vfmul.d VT1, VX2, VXC - vst VT0, X, 0 * SIZE - vst VT1, Y, 0 * SIZE - vfmul.d VT0, VX1, VXC - vfmul.d VT1, VX3, VXC - vst VT0, X, 2 * SIZE - vst VT1, Y, 2 * SIZE - vld VX0, X, 4 * SIZE - vld VX2, Y, 4 * SIZE - vld VX1, X, 6 * SIZE - vld VX3, Y, 6 * SIZE - vfmul.d VT0, VX0, VXC - vfmul.d VT1, VX2, VXC - vst VT0, X, 4 * SIZE - vst VT1, Y, 4 * SIZE - vfmul.d VT0, VX1, VXC - vfmul.d VT1, VX3, VXC - vst VT0, X, 6 * SIZE - vst VT1, Y, 6 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L112 - b .L997 - .align 3 - -.L113: // C==0 S!=0 - vld VX0, X, 0 * SIZE - vld VX2, Y, 0 * SIZE - vld VX1, X, 2 * SIZE - vld VX3, Y, 2 * SIZE - vfmul.d VT0, VX2, VXS - vfmul.d VT1, VX0, VXS - vfsub.d VT1, VXZ, VT1 - vst VT0, X, 0 * SIZE - vst VT1, Y, 0 * SIZE - vfmul.d VT0, VX3, VXS - vfmul.d VT1, VX1, VXS - vfsub.d VT1, VXZ, VT1 - vst VT0, X, 2 * SIZE - vst VT1, Y, 2 * SIZE - vld VX0, X, 4 * SIZE - vld VX2, Y, 4 * SIZE - vld VX1, X, 6 * SIZE - vld VX3, Y, 6 * SIZE - vfmul.d VT0, VX2, VXS - vfmul.d VT1, VX0, VXS - vfsub.d VT1, VXZ, VT1 - vst VT0, X, 4 * SIZE - vst VT1, Y, 4 * SIZE - vfmul.d VT0, VX3, VXS - vfmul.d VT1, VX1, VXS - vfsub.d VT1, VXZ, VT1 - vst VT0, X, 6 * SIZE - vst VT1, Y, 6 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L113 - b .L997 - .align 3 - -.L114: // C==0 S==0 - vst VXZ, X, 0 * SIZE - vst VXZ, Y, 0 * SIZE - vst VXZ, X, 2 * SIZE - vst VXZ, Y, 2 * SIZE - vst VXZ, X, 4 * SIZE - vst VXZ, Y, 4 * SIZE - vst VXZ, X, 6 * SIZE - vst VXZ, Y, 6 * SIZE - addi.d X, X, 8 * 
SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L114 - b .L997 - .align 3 - -.L12: // INCX==1 and INCY!=1 - bge $r0, I, .L997 - move YY, Y - move XX, X - fcmp.ceq.d $fcc0, C, a1 - bcnez $fcc0, .L120 - fcmp.ceq.d $fcc0, S, a1 - bcnez $fcc0, .L122 // C!=0 S==0 - b .L121 // C!=0 S!=0 - .align 3 - -.L120: - fcmp.ceq.d $fcc0, S, a1 - bcnez $fcc0, .L124 // C==0 S==0 - b .L123 // C==0 S!=0 - .align 3 - -.L121: // C!=0 S!=0 - vld VX0, X, 0 * SIZE - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vinsgr2vr.d VX2, t1, 0 - vinsgr2vr.d VX2, t2, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX0, VXC - vfmadd.d VT0, VX2, VXS, VT0 - vfmul.d VT1, VX0, VXS - vfmsub.d VT1, VX2, VXC, VT1 - vst VT0, X, 0 * SIZE - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - vld VX0, X, 2 * SIZE - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vinsgr2vr.d VX2, t3, 0 - vinsgr2vr.d VX2, t4, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX0, VXC - vfmadd.d VT0, VX2, VXS, VT0 - vfmul.d VT1, VX0, VXS - vfmsub.d VT1, VX2, VXC, VT1 - vst VT0, X, 2 * SIZE - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - vld VX1, X, 4 * SIZE - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vinsgr2vr.d VX3, t1, 0 - vinsgr2vr.d VX3, t2, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX1, VXC - vfmadd.d VT0, VX3, VXS, VT0 - vfmul.d VT1, VX1, VXS - vfmsub.d VT1, VX3, VXC, VT1 - vst VT0, X, 4 * SIZE - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - vld VX1, X, 6 * SIZE - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vinsgr2vr.d VX3, t3, 0 - vinsgr2vr.d VX3, t4, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX1, VXC - vfmadd.d VT0, VX3, VXS, VT0 - vfmul.d VT1, VX1, VXS - vfmsub.d VT1, VX3, VXC, VT1 - vst VT0, X, 6 * SIZE - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - b .L997 - .align 3 - -.L122: // C!=0 S==0 - vld VX0, X, 0 * SIZE - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vinsgr2vr.d VX2, t1, 0 - vinsgr2vr.d VX2, t2, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX0, VXC - vfmul.d VT1, VX2, VXC - vst VT0, X, 0 * SIZE - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - vld VX0, X, 2 * SIZE - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vinsgr2vr.d VX2, t3, 0 - vinsgr2vr.d VX2, t4, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX0, VXC - vfmul.d VT1, VX2, VXC - vst VT0, X, 2 * SIZE - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - vld VX1, X, 4 * SIZE - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vinsgr2vr.d VX3, t1, 0 - vinsgr2vr.d VX3, t2, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX1, VXC - vfmul.d VT1, VX3, VXC - vst VT0, X, 4 * SIZE - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - vld VX1, X, 6 * SIZE - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vinsgr2vr.d VX3, t3, 0 - vinsgr2vr.d VX3, t4, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX1, VXC - vfmul.d VT1, VX3, VXC - vst VT0, X, 6 * SIZE - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L122 - b .L997 - .align 3 - -.L123: // C==0 S!=0 - vld VX0, X, 0 * SIZE - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vinsgr2vr.d VX2, 
t1, 0 - vinsgr2vr.d VX2, t2, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX2, VXS - vfmul.d VT1, VX0, VXS - vfsub.d VT1, VXZ, VT1 - vst VT0, X, 0 * SIZE - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - vld VX0, X, 2 * SIZE - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vinsgr2vr.d VX2, t3, 0 - vinsgr2vr.d VX2, t4, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX2, VXS - vfmul.d VT1, VX0, VXS - vfsub.d VT1, VXZ, VT1 - vst VT0, X, 2 * SIZE - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - vld VX1, X, 4 * SIZE - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vinsgr2vr.d VX3, t1, 0 - vinsgr2vr.d VX3, t2, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX3, VXS - vfmul.d VT1, VX1, VXS - vfsub.d VT1, VXZ, VT1 - vst VT0, X, 4 * SIZE - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - vld VX1, X, 6 * SIZE - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vinsgr2vr.d VX3, t3, 0 - vinsgr2vr.d VX3, t4, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX3, VXS - vfmul.d VT1, VX1, VXS - vfsub.d VT1, VXZ, VT1 - vst VT0, X, 6 * SIZE - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L123 - b .L997 - .align 3 - -.L124: // C==0 S==0 - vst VXZ, X, 0 * SIZE - vst VXZ, X, 4 * SIZE - vstelm.d VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 1 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L124 - b .L997 - .align 3 - -.L21:// INCX!=1 and INCY==1 - bge $r0, I, .L997 - move XX, X - fcmp.ceq.d $fcc0, C, a1 - bcnez $fcc0, .L210 - fcmp.ceq.d $fcc0, S, a1 - bcnez $fcc0, .L212 // C!=0 S==0 - b .L211 // C!=0 S!=0 - .align 3 - -.L210: - fcmp.ceq.d $fcc0, S, a1 - bcnez $fcc0, .L214 // C==0 S==0 - b .L213 // C==0 S!=0 - .align 3 - -.L211: // C!=0 S!=0 - vld VX2, Y, 0 * SIZE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - add.d X, X, INCX - vfmul.d VT0, VXC, VX0 - vfmadd.d VT0, VX2, VXS, VT0 - vfmul.d VT1, VXS, VX0 - vfmsub.d VT1, VX2, VXC, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 0 * SIZE - vld VX2, Y, 2 * SIZE - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - vinsgr2vr.d VX0, t3, 0 - vinsgr2vr.d VX0, t4, 1 - add.d X, X, INCX - vfmul.d VT0, VXC, VX0 - vfmadd.d VT0, VX2, VXS, VT0 - vfmul.d VT1, VXS, VX0 - vfmsub.d VT1, VX2, VXC, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 2 * SIZE - vld VX3, Y, 4 * SIZE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - add.d X, X, INCX - vfmul.d VT0, VX1, VXC - vfmadd.d VT0, VX3, VXS, VT0 - vfmul.d VT1, VX1, VXS - vfmsub.d VT1, VX3, VXC, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 4 * SIZE - vld VX3, Y, 6 * SIZE - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - add.d X, X, INCX - vfmul.d VT0, VX1, VXC - vfmadd.d VT0, VX3, VXS, VT0 - vfmul.d VT1, VX1, VXS 
- vfmsub.d VT1, VX3, VXC, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 6 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - b .L997 - .align 3 - -.L212: // C!=0 S==0 - vld VX2, Y, 0 * SIZE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - add.d X, X, INCX - vfmul.d VT0, VXC, VX0 - vfmul.d VT1, VX2, VXC - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 0 * SIZE - vld VX2, Y, 2 * SIZE - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - vinsgr2vr.d VX0, t3, 0 - vinsgr2vr.d VX0, t4, 1 - add.d X, X, INCX - vfmul.d VT0, VXC, VX0 - vfmul.d VT1, VX2, VXC - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 2 * SIZE - vld VX3, Y, 4 * SIZE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - add.d X, X, INCX - vfmul.d VT0, VX1, VXC - vfmul.d VT1, VX3, VXS - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 4 * SIZE - vld VX3, Y, 6 * SIZE - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - add.d X, X, INCX - vfmul.d VT0, VX1, VXC - vfmul.d VT1, VX3, VXS - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - vst VT1, Y, 6 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L212 - b .L997 - .align 3 - -.L213: // C==0 S!=0 - vld VX2, Y, 0 * SIZE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - add.d X, X, INCX - vfmul.d VT0, VXS, VX2 - vfmul.d VT1, VXS, VX0 - vfsub.d VT1, VXZ, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 0 * SIZE - vld VX2, Y, 2 * SIZE - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - vinsgr2vr.d VX0, t3, 0 - vinsgr2vr.d VX0, t4, 1 - add.d X, X, INCX - vfmul.d VT0, VXS, VX2 - vfmul.d VT1, VXS, VX0 - vfsub.d VT1, VXZ, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 2 * SIZE - vld VX3, Y, 4 * SIZE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - add.d X, X, INCX - vfmul.d VT0, VX3, VXS - vfmul.d VT1, VX1, VXS - vfsub.d VT1, VXZ, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 4 * SIZE - vld VX3, Y, 6 * SIZE - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - add.d X, X, INCX - vfmul.d VT0, VX3, VXS - vfmul.d VT1, VX1, VXS - vfsub.d VT1, VXZ, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 6 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L213 - b .L997 - .align 3 - -.L214: // C==0 S==0 - vstelm.d VXZ, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 0 * SIZE - vstelm.d VXZ, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 
- blt $r0, I, .L211 - b .L997 - .align 3 - -.L22: - bge $r0, I, .L997 - move YY, Y - move XX, X - fcmp.ceq.d $fcc0, C, a1 - bcnez $fcc0, .L220 - fcmp.ceq.d $fcc0, S, a1 - bcnez $fcc0, .L222 // C!=0 S==0 - b .L221 // C!=0 S!=0 - .align 3 - -.L220: - fcmp.ceq.d $fcc0, S, a1 - bcnez $fcc0, .L224 // C==0 S==0 - b .L223 // C==0 S!=0 - .align 3 - -.L221: // C!=0 S!=0 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - add.d X, X, INCX - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vinsgr2vr.d VX2, t1, 0 - vinsgr2vr.d VX2, t2, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX0, VXC - vfmadd.d VT0, VX2, VXS, VT0 - vfmul.d VT1, VX0, VXS - vfmsub.d VT1, VX2, VXC, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t3, 0 - vinsgr2vr.d VX0, t4, 1 - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vinsgr2vr.d VX2, t3, 0 - vinsgr2vr.d VX2, t4, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX0, VXC - vfmadd.d VT0, VX2, VXS, VT0 - vfmul.d VT1, VX0, VXS - vfmsub.d VT1, VX2, VXC, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vinsgr2vr.d VX3, t1, 0 - vinsgr2vr.d VX3, t2, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX1, VXC - vfmadd.d VT0, VX3, VXS, VT0 - vfmul.d VT1, VX0, VXS - vfmsub.d VT1, VX3, VXC, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - add.d X, X, INCX - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vinsgr2vr.d VX3, t3, 0 - vinsgr2vr.d VX3, t4, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX1, VXC - vfmadd.d VT0, VX3, VXS, VT0 - vfmul.d VT1, VX0, VXS - vfmsub.d VT1, VX3, VXC, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L221 - b .L997 - .align 3 - -.L222: // C!=0 S==0 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vinsgr2vr.d VX2, t1, 0 - vinsgr2vr.d VX2, t2, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX0, VXC - vfmul.d VT1, VX2, VXC - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t3, 0 - vinsgr2vr.d VX0, t4, 1 - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vinsgr2vr.d VX2, t3, 0 - vinsgr2vr.d VX2, t4, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX0, VXC - vfmul.d VT1, VX2, VXC - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - 
vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vinsgr2vr.d VX3, t1, 0 - vinsgr2vr.d VX3, t2, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX1, VXC - vfmul.d VT1, VX3, VXC - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vinsgr2vr.d VX3, t3, 0 - vinsgr2vr.d VX3, t4, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX1, VXC - vfmul.d VT1, VX3, VXC - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L222 - b .L997 - .align 3 - -.L223: // C==0 S!=0 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vinsgr2vr.d VX2, t1, 0 - vinsgr2vr.d VX2, t2, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX2, VXS - vfmul.d VT1, VX0, VXS - vfsub.d VT1, VXZ, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t3, 0 - vinsgr2vr.d VX0, t4, 1 - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vinsgr2vr.d VX2, t3, 0 - vinsgr2vr.d VX2, t4, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX2, VXS - vfmul.d VT1, VX0, VXS - vfsub.d VT1, VXZ, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vinsgr2vr.d VX3, t1, 0 - vinsgr2vr.d VX3, t2, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX3, VXS - vfmul.d VT1, VX0, VXS - vfsub.d VT1, VXZ, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vinsgr2vr.d VX3, t3, 0 - vinsgr2vr.d VX3, t4, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX3, VXS - vfmul.d VT1, VX0, VXS - vfsub.d VT1, VXZ, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L223 - b .L997 - .align 3 - -.L224: // C==0 S==0 - vstelm.d VXZ, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 
0, 1 - add.d XX, XX, INCX - vstelm.d VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.d VXZ, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 1 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L224 - b .L997 - .align 3 - -.L997: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L998: - fld.d $f12, X, 0 * SIZE - fld.d $f13, Y, 0 * SIZE - fmul.d $f10, $f12, C - fmadd.d $f10, $f13, S, $f10 - fst.d $f10, X, 0 * SIZE - addi.d I, I, -1 - fmul.d $f20, $f12, S - fmsub.d $f20, $f13, C, $f20 - fst.d $f20, Y, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L998 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/drot_lasx.S b/kernel/loongarch64/rot_lasx.S similarity index 52% rename from kernel/loongarch64/drot_lasx.S rename to kernel/loongarch64/rot_lasx.S index d3644b780..5d7e3d7cc 100644 --- a/kernel/loongarch64/drot_lasx.S +++ b/kernel/loongarch64/rot_lasx.S @@ -1,3 +1,30 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + #define ASSEMBLER #include "common.h" @@ -33,16 +60,25 @@ bge $r0, N, .L999 li.d TEMP, 1 movgr2fr.d a1, $r0 - ffint.d.l a1, a1 + FFINT a1, a1 slli.d TEMP, TEMP, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT slli.d INCY, INCY, BASE_SHIFT +#ifdef DOUBLE movfr2gr.d t1, C xvreplgr2vr.d VXC, t1 movfr2gr.d t2, S xvreplgr2vr.d VXS, t2 movfr2gr.d t3, a1 xvreplgr2vr.d VXZ, t3 +#else + movfr2gr.s t1, C + xvreplgr2vr.w VXC, t1 + movfr2gr.s t2, S + xvreplgr2vr.w VXS, t2 + movfr2gr.s t3, a1 + xvreplgr2vr.w VXZ, t3 +#endif srai.d I, N, 3 bne INCX, TEMP, .L20 bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 @@ -53,15 +89,15 @@ .L11: bge $r0, I, .L997 - fcmp.ceq.d $fcc0, C, a1 + CMPEQ $fcc0, C, a1 bcnez $fcc0, .L110 - fcmp.ceq.d $fcc0, S, a1 + CMPEQ $fcc0, S, a1 bcnez $fcc0, .L112 // C!=0 S==0 b .L111 // C!=0 S!=0 .align 3 .L110: - fcmp.ceq.d $fcc0, S, a1 + CMPEQ $fcc0, S, a1 bcnez $fcc0, .L114 // C==0 S==0 b .L113 // C==0 S!=0 .align 3 @@ -69,20 +105,24 @@ .L111: // C!=0 S!=0 xvld VX0, X, 0 * SIZE xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE xvld VX1, X, 4 * SIZE xvld VX3, Y, 4 * SIZE - xvfmul.d VT0, VX0, VXC - xvfmadd.d VT0, VX2, VXS, VT0 - xvfmul.d VT1, VX0, VXS - xvfmsub.d VT1, VX2, VXC, VT1 +#endif + XVMUL VT0, VX0, VXC + XVFMADD VT0, VX2, VXS, VT0 + XVMUL VT1, VX0, VXS + XVMSUB VT1, VX2, VXC, VT1 xvst VT0, X, 0 * SIZE xvst VT1, Y, 0 * SIZE - xvfmul.d VT0, VX1, VXC - xvfmadd.d VT0, VX3, VXS, VT0 - xvfmul.d VT1, VX1, VXS - xvfmsub.d VT1, VX3, VXC, VT1 +#ifdef DOUBLE + XVMUL VT0, VX1, VXC + XVFMADD VT0, VX3, VXS, VT0 + XVMUL VT1, VX1, VXS + XVMSUB VT1, VX3, VXC, VT1 xvst VT0, X, 4 * SIZE xvst VT1, Y, 4 * SIZE +#endif addi.d X, X, 8 * SIZE addi.d Y, Y, 8 * SIZE addi.d I, I, -1 @@ -93,16 +133,20 @@ .L112: // C!=0 S==0 xvld VX0, X, 0 * SIZE xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE xvld VX1, X, 4 * SIZE xvld VX3, Y, 4 * SIZE - xvfmul.d VT0, VX0, VXC - xvfmul.d VT1, VX2, VXC +#endif + XVMUL VT0, VX0, VXC + XVMUL VT1, VX2, VXC xvst VT0, X, 0 * SIZE xvst VT1, Y, 0 * SIZE - xvfmul.d VT0, VX1, VXC - xvfmul.d VT1, VX3, VXC +#ifdef DOUBLE + XVMUL VT0, VX1, VXC + XVMUL VT1, VX3, VXC xvst VT0, X, 4 * SIZE xvst VT1, Y, 4 * SIZE +#endif addi.d X, X, 8 * SIZE addi.d Y, Y, 8 * SIZE addi.d I, I, -1 @@ -113,18 +157,22 @@ .L113: // C==0 S!=0 xvld VX0, X, 0 * SIZE xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE xvld VX1, X, 4 * SIZE xvld VX3, Y, 4 * SIZE - xvfmul.d VT0, VX2, VXS - xvfmul.d VT1, VX0, VXS - xvfsub.d VT1, VXZ, VT1 +#endif + XVMUL VT0, VX2, VXS + XVMUL VT1, VX0, VXS + XVFSUB VT1, VXZ, VT1 xvst VT0, X, 0 * SIZE xvst VT1, Y, 0 * SIZE - xvfmul.d VT0, VX3, VXS - xvfmul.d VT1, VX1, VXS +#ifdef DOUBLE + XVMUL VT0, VX3, VXS + XVMUL VT1, VX1, VXS xvfsub.d VT1, VXZ, VT1 xvst VT0, X, 4 * SIZE xvst VT1, Y, 4 * SIZE +#endif addi.d X, X, 8 * SIZE addi.d Y, Y, 8 * SIZE addi.d I, I, -1 @@ -135,8 +183,10 @@ .L114: // C==0 S==0 xvst VXZ, X, 0 * SIZE xvst VXZ, Y, 0 * SIZE +#ifdef DOUBLE xvst VXZ, X, 4 * SIZE xvst VXZ, Y, 4 * SIZE +#endif addi.d X, X, 8 * SIZE addi.d Y, Y, 8 * SIZE addi.d I, I, -1 @@ -148,37 +198,66 @@ bge $r0, I, .L997 move YY, Y move XX, X - fcmp.ceq.d $fcc0, C, a1 + CMPEQ $fcc0, C, a1 bcnez $fcc0, .L120 - fcmp.ceq.d $fcc0, S, a1 + CMPEQ $fcc0, S, a1 bcnez $fcc0, .L122 // C!=0 S==0 b .L121 // C!=0 S!=0 .align 3 .L120: - fcmp.ceq.d $fcc0, S, a1 + CMPEQ $fcc0, S, a1 bcnez $fcc0, .L124 // C==0 S==0 b .L123 // C==0 S!=0 .align 3 .L121: // C!=0 S!=0 xvld VX0, X, 0 * SIZE - ld.d t1, Y, 0 * SIZE +#ifdef DOUBLE + ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY - ld.d t2, Y, 0 * 
SIZE + ld.d t2, Y, 0 * SIZE add.d Y, Y, INCY - ld.d t3, Y, 0 * SIZE + ld.d t3, Y, 0 * SIZE add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE + ld.d t4, Y, 0 * SIZE xvinsgr2vr.d VX2, t1, 0 xvinsgr2vr.d VX2, t2, 1 xvinsgr2vr.d VX2, t3, 2 xvinsgr2vr.d VX2, t4, 3 add.d Y, Y, INCY - xvfmul.d VT0, VX0, VXC - xvfmadd.d VT0, VX2, VXS, VT0 - xvfmul.d VT1, VX0, VXS - xvfmsub.d VT1, VX2, VXC, VT1 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY +#endif + XVMUL VT0, VX0, VXC + XVFMADD VT0, VX2, VXS, VT0 + XVMUL VT1, VX0, VXS + XVMSUB VT1, VX2, VXC, VT1 + +#ifdef DOUBLE xvld VX1, X, 4 * SIZE xvst VT0, X, 0 * SIZE xvstelm.d VT1, YY, 0, 0 @@ -201,11 +280,10 @@ xvinsgr2vr.d VX3, t3, 2 xvinsgr2vr.d VX3, t4, 3 add.d Y, Y, INCY - xvfmul.d VT0, VX1, VXC - xvfmadd.d VT0, VX3, VXS, VT0 - xvfmul.d VT1, VX1, VXS - xvfmsub.d VT1, VX3, VXC, VT1 - addi.d I, I, -1 + XVMUL VT0, VX1, VXC + XVFMADD VT0, VX3, VXS, VT0 + XVMUL VT1, VX1, VXS + XVMSUB VT1, VX3, VXC, VT1 xvst VT0, X, 4 * SIZE xvstelm.d VT1, YY, 0, 0 add.d YY, YY, INCY @@ -214,13 +292,34 @@ xvstelm.d VT1, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VT1, YY, 0, 3 +#else + xvst VT0, X, 0 * SIZE + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 7 + +#endif add.d YY, YY, INCY addi.d X, X, 8 * SIZE + addi.d I, I, -1 blt $r0, I, .L121 b .L997 .align 3 .L122: // C!=0 S==0 +#ifdef DOUBLE xvld VX0, X, 0 * SIZE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY @@ -269,13 +368,60 @@ xvstelm.d VT1, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VT1, YY, 0, 3 +#else + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VT0, VX0, VXC + xvfmul.s VT1, VX2, VXC + xvst VT0, X, 0 * SIZE + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 7 +#endif add.d YY, YY, INCY addi.d X, X, 8 * SIZE + addi.d I, I, -1 blt $r0, I, .L122 b .L997 .align 3 .L123: // C==0 S!=0 +#ifdef DOUBLE xvld VX0, X, 0 * SIZE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY @@ -326,14 +472,63 @@ xvstelm.d VT1, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VT1, YY, 
0, 3 +#else + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VT0, VX2, VXS + xvfmul.s VT1, VX0, VXS + xvfsub.s VT1, VXZ, VT1 + xvst VT0, X, 0 * SIZE + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 7 +#endif add.d YY, YY, INCY addi.d X, X, 8 * SIZE + addi.d I, I, -1 blt $r0, I, .L123 b .L997 .align 3 .L124: // C==0 S==0 xvst VXZ, X, 0 * SIZE +#ifdef DOUBLE + xvst VXZ, X, 0 * SIZE xvst VXZ, X, 4 * SIZE xvstelm.d VXZ, YY, 0, 0 add.d YY, YY, INCY @@ -350,29 +545,50 @@ xvstelm.d VXZ, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VXZ, YY, 0, 3 +#else + xvst VXZ, X, 0 * SIZE + xvstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 7 +#endif add.d YY, YY, INCY addi.d I, I, -1 + addi.d X, X, 8 * SIZE blt $r0, I, .L124 + move Y, YY b .L997 .align 3 .L21:// INCX!=1 and INCY==1 bge $r0, I, .L997 move XX, X - fcmp.ceq.d $fcc0, C, a1 + CMPEQ $fcc0, C, a1 bcnez $fcc0, .L210 - fcmp.ceq.d $fcc0, S, a1 + CMPEQ $fcc0, S, a1 bcnez $fcc0, .L212 // C!=0 S==0 b .L211 // C!=0 S!=0 .align 3 .L210: - fcmp.ceq.d $fcc0, S, a1 + CMPEQ $fcc0, S, a1 bcnez $fcc0, .L214 // C==0 S==0 b .L213 // C==0 S!=0 .align 3 .L211: // C!=0 S!=0 +#ifdef DOUBLE xvld VX2, Y, 0 * SIZE ld.d t1, X, 0 * SIZE add.d X, X, INCX @@ -425,6 +641,54 @@ xvstelm.d VT0, XX, 0, 3 add.d XX, XX, INCX xvst VT1, Y, 4 * SIZE +#else + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VT0, VXC, VX0 + xvfmadd.s VT0, VX2, VXS, VT0 + xvfmul.s VT1, VX0, VXS + xvfmsub.s VT1, VX2, VXC, VT1 + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvst VT1, Y, 0 * SIZE +#endif addi.d Y, Y, 8 * SIZE addi.d I, I, -1 blt $r0, I, .L211 @@ 
-432,6 +696,7 @@ .align 3 .L212: // C!=0 S==0 +#ifdef DOUBLE xvld VX2, Y, 0 * SIZE ld.d t1, X, 0 * SIZE add.d X, X, INCX @@ -480,6 +745,52 @@ add.d XX, XX, INCX xvfmul.d VT1, VX3, VXS xvst VT1, Y, 4 * SIZE +#else + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VT0, VXC, VX0 + xvfmul.s VT1, VX2, VXC + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvst VT1, Y, 0 * SIZE +#endif addi.d Y, Y, 8 * SIZE addi.d I, I, -1 blt $r0, I, .L212 @@ -487,6 +798,7 @@ .align 3 .L213: // C==0 S!=0 +#ifdef DOUBLE xvld VX2, Y, 0 * SIZE ld.d t1, X, 0 * SIZE add.d X, X, INCX @@ -537,6 +849,53 @@ xvstelm.d VT0, XX, 0, 3 add.d XX, XX, INCX xvst VT1, Y, 4 * SIZE +#else + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VT0, VXS, VX2 + xvfmul.s VT1, VXS, VX0 + xvfsub.s VT1, VXZ, VT1 + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvst VT1, Y, 0 * SIZE +#endif addi.d Y, Y, 8 * SIZE addi.d I, I, -1 blt $r0, I, .L213 @@ -544,6 +903,7 @@ .align 3 .L214: // C==0 S==0 +#ifdef DOUBLE xvstelm.d VXZ, XX, 0, 0 add.d XX, XX, INCX xvstelm.d VXZ, XX, 0, 1 @@ -562,6 +922,25 @@ xvstelm.d VXZ, XX, 0, 3 add.d XX, XX, INCX xvst VT1, Y, 4 * SIZE +#else + xvstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 3 + add.d XX, XX, INCX + xvst VT1, Y, 0 * SIZE + xvstelm.w VXZ, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 7 + add.d XX, XX, INCX +#endif addi.d Y, Y, 8 * SIZE addi.d I, I, -1 blt $r0, I, .L211 @@ -572,20 +951,21 @@ bge $r0, I, .L997 move YY, Y move XX, X - fcmp.ceq.d $fcc0, C, a1 + CMPEQ $fcc0, C, a1 bcnez $fcc0, .L220 - fcmp.ceq.d $fcc0, S, a1 + CMPEQ $fcc0, S, a1 bcnez $fcc0, .L222 // C!=0 S==0 b .L221 // C!=0 S!=0 .align 3 .L220: - 
fcmp.ceq.d $fcc0, S, a1 + CMPEQ $fcc0, S, a1 bcnez $fcc0, .L224 // C==0 S==0 b .L223 // C==0 S!=0 .align 3 .L221: // C!=0 S!=0 +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -674,12 +1054,99 @@ add.d YY, YY, INCY xvstelm.d VT1, YY, 0, 3 add.d YY, YY, INCY +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VT0, VX0, VXC + xvfmadd.s VT0, VX2, VXS, VT0 + xvfmul.s VT1, VX0, VXS + xvfmsub.s VT1, VX2, VXC, VT1 + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 7 + add.d YY, YY, INCY +#endif addi.d I, I, -1 blt $r0, I, .L221 b .L997 .align 3 .L222: // C!=0 S==0 +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -764,12 +1231,97 @@ add.d YY, YY, INCY xvstelm.d VT1, YY, 0, 3 add.d YY, YY, INCY +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, 
t4, 7 + add.d Y, Y, INCY + xvfmul.s VT0, VX0, VXC + xvfmul.s VT1, VX2, VXC + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 7 + add.d YY, YY, INCY +#endif addi.d I, I, -1 blt $r0, I, .L222 b .L997 .align 3 .L223: // C==0 S!=0 +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -856,12 +1408,98 @@ add.d YY, YY, INCY xvstelm.d VT1, YY, 0, 3 add.d YY, YY, INCY +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VT0, VX2, VXS + xvfmul.s VT1, VX0, VXS + xvfsub.s VT1, VXZ, VT1 + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 7 + add.d YY, YY, INCY +#endif addi.d I, I, -1 blt $r0, I, .L223 b .L997 .align 3 .L224: // C==0 S==0 +#ifdef DOUBLE xvstelm.d VXZ, XX, 0, 0 add.d XX, XX, INCX xvstelm.d VXZ, XX, 0, 1 @@ -893,9 +1531,46 @@ xvstelm.d VXZ, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VXZ, YY, 0, 3 +#else + xvstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 2 + 
add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VXZ, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 7 + add.d XX, XX, INCX + xvstelm.w VXZ, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 7 +#endif add.d YY, YY, INCY addi.d I, I, -1 blt $r0, I, .L224 +#ifdef DOUBLE + move X, XX + move Y, YY +#endif b .L997 .align 3 @@ -905,15 +1580,15 @@ .align 3 .L998: - fld.d $f12, X, 0 * SIZE - fld.d $f13, Y, 0 * SIZE - fmul.d $f10, $f12, C - fmadd.d $f10, $f13, S, $f10 - fst.d $f10, X, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f13, Y, 0 * SIZE + MUL $f10, $f12, C + MADD $f10, $f13, S, $f10 + ST $f10, X, 0 * SIZE addi.d I, I, -1 - fmul.d $f20, $f12, S - fmsub.d $f20, $f13, C, $f20 - fst.d $f20, Y, 0 * SIZE + MUL $f20, $f12, S + MSUB $f20, $f13, C, $f20 + ST $f20, Y, 0 * SIZE add.d X, X, INCX add.d Y, Y, INCY blt $r0, I, .L998 @@ -924,4 +1599,4 @@ jirl $r0, $r1, 0x0 .align 3 - EPILOGUE \ No newline at end of file + EPILOGUE diff --git a/kernel/loongarch64/rot_lsx.S b/kernel/loongarch64/rot_lsx.S new file mode 100644 index 000000000..4b0e59310 --- /dev/null +++ b/kernel/loongarch64/rot_lsx.S @@ -0,0 +1,1791 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define C $f0 +#define S $f1 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VT0 $vr10 +#define VT1 $vr18 +#define VXC $vr23 +#define VXS $vr9 +#define VXZ $vr19 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT +#ifdef DOUBLE + movfr2gr.d t1, C + vreplgr2vr.d VXC, t1 + movfr2gr.d t2, S + vreplgr2vr.d VXS, t2 + movfr2gr.d t3, a1 + vreplgr2vr.d VXZ, t3 +#else + movfr2gr.s t1, C + vreplgr2vr.w VXC, t1 + movfr2gr.s t2, S + vreplgr2vr.w VXS, t2 + movfr2gr.s t3, a1 + vreplgr2vr.w VXZ, t3 +#endif + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L110 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L112 // C!=0 S==0 + b .L111 // C!=0 S!=0 + .align 3 + +.L110: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L114 // C==0 S==0 + b .L113 // C==0 S!=0 + .align 3 + +.L111: // C!=0 S!=0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE +#endif + VMUL VT0, VX0, VXC + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX2, VXC, VT1 + vst VT0, X, 0 * SIZE + vst VT1, Y, 0 * SIZE + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 +#ifdef DOUBLE + vst VT0, X, 2 * SIZE + vst VT1, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + VMUL VT0, VX0, VXC + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX2, VXC, VT1 +#endif + vst VT0, X, 4 * SIZE + vst VT1, Y, 4 * SIZE +#ifdef DOUBLE + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 + vst VT0, X, 6 * SIZE + vst VT1, Y, 6 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // C!=0 S==0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE +#endif + VMUL VT0, VX0, VXC + VMUL VT1, VX2, VXC + vst VT0, X, 0 * SIZE + vst VT1, Y, 0 * SIZE + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC +#ifdef DOUBLE + vst VT0, X, 2 * SIZE + vst VT1, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + VMUL VT0, VX0, VXC + VMUL VT1, VX2, VXC +#endif + vst VT0, X, 4 * SIZE + vst VT1, Y, 4 * SIZE +#ifdef DOUBLE + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC + vst VT0, X, 6 * SIZE + vst VT1, Y, 6 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // C==0 S!=0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE +#endif + VMUL VT0, VX2, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + vst VT0, X, 0 
* SIZE + vst VT1, Y, 0 * SIZE + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 +#ifdef DOUBLE + vst VT0, X, 2 * SIZE + vst VT1, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + VMUL VT0, VX2, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 +#endif + vst VT0, X, 4 * SIZE + vst VT1, Y, 4 * SIZE +#ifdef DOUBLE + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vst VT0, X, 6 * SIZE + vst VT1, Y, 6 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // C==0 S==0 + vst VXZ, X, 0 * SIZE + vst VXZ, Y, 0 * SIZE +#ifdef DOUBLE + vst VXZ, X, 2 * SIZE + vst VXZ, Y, 2 * SIZE +#endif + vst VXZ, X, 4 * SIZE + vst VXZ, Y, 4 * SIZE +#ifdef DOUBLE + vst VXZ, X, 6 * SIZE + vst VXZ, Y, 6 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + move XX, X + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L120 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L122 // C!=0 S==0 + b .L121 // C!=0 S!=0 + .align 3 + +.L120: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L124 // C==0 S==0 + b .L123 // C==0 S!=0 + .align 3 + +.L121: // C!=0 S!=0 +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 +#else + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE +#endif + add.d Y, Y, INCY +#ifndef DOUBLE + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY +#endif + VMUL VT0, VX0, VXC + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX2, VXC, VT1 + vst VT0, X, 0 * SIZE +#ifdef DOUBLE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX0, X, 2 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX2, VXC, VT1 + vst VT0, X, 2 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 +#else + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 +#endif + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 + vst VT0, X, 4 * SIZE +#ifdef DOUBLE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 6 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 + vst VT0, X, 6 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + 
vstelm.d VT1, YY, 0, 1 +#else + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + b .L997 + .align 3 + +.L122: // C!=0 S==0 +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE +#else + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE +#endif +#ifdef DOUBLE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 +#else + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 +#endif + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VMUL VT1, VX2, VXC + vst VT0, X, 0 * SIZE +#ifdef DOUBLE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX0, X, 2 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VMUL VT1, VX2, VXC + vst VT0, X, 2 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC + vst VT0, X, 4 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 6 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC + vst VT0, X, 6 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 +#else + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC + vst VT0, X, 4 * SIZE + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L122 + b .L997 + .align 3 + +.L123: // C==0 S!=0 +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE +#else + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE +#endif +#ifdef DOUBLE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 +#else + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 +#endif + add.d Y, Y, INCY + VMUL VT0, VX2, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + vst VT0, X, 0 * SIZE +#ifdef DOUBLE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX0, X, 2 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + 
vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX2, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + vst VT0, X, 2 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vst VT0, X, 4 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 6 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vst VT0, X, 6 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 +#else + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vst VT0, X, 4 * SIZE + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L123 + b .L997 + .align 3 + +.L124: // C==0 S==0 + vst VXZ, X, 0 * SIZE + vst VXZ, X, 4 * SIZE +#ifdef DOUBLE + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 +#else + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L124 +#ifdef DOUBLE + move Y, YY +#endif + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + move XX, X + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L210 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L212 // C!=0 S==0 + b .L211 // C!=0 S!=0 + .align 3 + +.L210: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L214 // C==0 S==0 + b .L213 // C==0 S!=0 + .align 3 + +.L211: // C!=0 S!=0 +#ifdef DOUBLE + vld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE +#endif +#ifdef DOUBLE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 +#else + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 +#endif + add.d X, X, INCX + VMUL VT0, VXC, VX0 + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VXS, VX0 + VMSUB VT1, VX2, VXC, VT1 
+#ifdef DOUBLE + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX2, Y, 2 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + add.d X, X, INCX + VMUL VT0, VXC, VX0 + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VXS, VX0 + VMSUB VT1, VX2, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 2 * SIZE + vld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 6 * SIZE +#else + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + b .L997 + .align 3 + +.L212: // C!=0 S==0 +#ifdef DOUBLE + vld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE +#endif +#ifdef DOUBLE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 +#else + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 +#endif + add.d X, X, INCX + VMUL VT0, VXC, VX0 + VMUL VT1, VX2, VXC + +#ifdef DOUBLE + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX2, Y, 2 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + add.d X, X, INCX + VMUL VT0, VXC, VX0 + VMUL VT1, VX2, VXC + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 2 * SIZE + vld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXS + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d 
VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXS + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + vst VT1, Y, 6 * SIZE +#else + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXS + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L212 + b .L997 + .align 3 + +.L213: // C==0 S!=0 +#ifdef DOUBLE + vld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE +#endif +#ifdef DOUBLE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 +#else + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 +#endif + add.d X, X, INCX + VMUL VT0, VXS, VX2 + VMUL VT1, VXS, VX0 + VFSUB VT1, VXZ, VT1 + +#ifdef DOUBLE + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX2, Y, 2 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + add.d X, X, INCX + VMUL VT0, VXS, VX2 + VMUL VT1, VXS, VX0 + VFSUB VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 2 * SIZE + vld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 6 * SIZE +#else + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * 
SIZE + addi.d I, I, -1 + blt $r0, I, .L213 + b .L997 + .align 3 + +.L214: // C==0 S==0 +#ifdef DOUBLE + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 +#else + vstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 3 +#endif + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 +#ifdef DOUBLE + move X, XX +#endif + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + move XX, X + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L220 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L222 // C!=0 S==0 + b .L221 // C!=0 S!=0 + .align 3 + +.L220: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L224 // C==0 S==0 + b .L223 // C==0 S!=0 + .align 3 + +.L221: // C!=0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX +#endif +#ifdef DOUBLE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX2, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX2, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX3, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB 
VT1, VX3, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY +#else + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX2, VXC, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX3, VXC, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY +#endif + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: // C!=0 S==0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX +#ifndef DOUBLE + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX +#endif +#ifdef DOUBLE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VMUL VT1, VX2, VXC + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VMUL VT1, VX2, VXC + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC 
+ vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 +#else + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VMUL VT1, VX2, VXC + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: // C==0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX +#endif +#ifdef DOUBLE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + VMUL VT0, VX2, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX2, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + vstelm.d VT0, 
XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + VMUL VT0, VX3, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX3, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 +#else + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX2, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX3, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: // C==0 S==0 +#ifdef DOUBLE + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + add.d 
XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 +#else + vstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 +#ifdef DOUBLE + move X, XX + move Y, YY +#endif + b .L997 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + LD $f12, X, 0 * SIZE + LD $f13, Y, 0 * SIZE + MUL $f10, $f12, C + MADD $f10, $f13, S, $f10 + ST $f10, X, 0 * SIZE + addi.d I, I, -1 + MUL $f20, $f12, S + MSUB $f20, $f13, C, $f20 + ST $f20, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/srot_lasx.S b/kernel/loongarch64/srot_lasx.S deleted file mode 100644 index 9aeb4dcf5..000000000 --- a/kernel/loongarch64/srot_lasx.S +++ /dev/null @@ -1,863 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define Y $r7 -#define INCY $r8 -#define C $f0 -#define S $f1 - -#define I $r12 -#define TEMP $r13 -#define t1 $r14 -#define t2 $r16 -#define t3 $r15 -#define t4 $r17 -#define XX $r18 -#define YY $r19 -#define a1 $f12 -#define VX0 $xr8 -#define VX1 $xr20 -#define VX2 $xr21 -#define VX3 $xr22 -#define VT0 $xr10 -#define VT1 $xr18 -#define VXC $xr23 -#define VXS $xr9 -#define VXZ $xr19 - - PROLOGUE - - bge $r0, N, .L999 - li.d TEMP, 1 - movgr2fr.d a1, $r0 - ffint.s.l a1, a1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - movfr2gr.s t1, C - xvreplgr2vr.w VXC, t1 - movfr2gr.s t2, S - xvreplgr2vr.w VXS, t2 - movfr2gr.s t3, a1 - xvreplgr2vr.w VXZ, t3 - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L997 - fcmp.ceq.s $fcc0, C, a1 - bcnez $fcc0, .L110 - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L112 // C!=0 S==0 - b .L111 // C!=0 S!=0 - .align 3 - -.L110: - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L114 // C==0 S==0 - b .L113 // C==0 S!=0 - .align 3 - -.L111: // C!=0 S!=0 - xvld VX0, X, 0 * SIZE - xvld VX2, Y, 0 * SIZE - xvfmul.s VT0, VX0, VXC - xvfmadd.s VT0, VX2, VXS, VT0 - xvfmul.s VT1, VX0, VXS - xvfmsub.s VT1, VX2, VXC, VT1 - xvst VT0, X, 0 * SIZE - xvst VT1, Y, 0 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L111 - b .L997 - .align 3 - -.L112: // C!=0 S==0 - xvld VX0, X, 0 * SIZE - xvld VX2, Y, 0 * SIZE - xvfmul.s VT0, VX0, VXC - xvfmul.s VT1, VX2, VXC - xvst VT0, X, 0 * SIZE - xvst VT1, Y, 0 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L112 
- b .L997 - .align 3 - -.L113: // C==0 S!=0 - xvld VX0, X, 0 * SIZE - xvld VX2, Y, 0 * SIZE - xvfmul.s VT0, VX2, VXS - xvfmul.s VT1, VX0, VXS - xvfsub.s VT1, VXZ, VT1 - xvst VT0, X, 0 * SIZE - xvst VT1, Y, 0 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L113 - b .L997 - .align 3 - -.L114: // C==0 S==0 - xvst VXZ, X, 0 * SIZE - xvst VXZ, Y, 0 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L114 - b .L997 - .align 3 - -.L12: // INCX==1 and INCY!=1 - bge $r0, I, .L997 - move YY, Y - move XX, X - fcmp.ceq.s $fcc0, C, a1 - bcnez $fcc0, .L120 - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L122 // C!=0 S==0 - b .L121 // C!=0 S!=0 - .align 3 - -.L120: - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L124 // C==0 S==0 - b .L123 // C==0 S!=0 - .align 3 - -.L121: // C!=0 S!=0 - xvld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvfmul.s VT0, VX0, VXC - xvfmadd.s VT0, VX2, VXS, VT0 - xvfmul.s VT1, VX0, VXS - xvfmsub.s VT1, VX2, VXC, VT1 - xvst VT0, X, 0 * SIZE - xvstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 7 - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - b .L997 - .align 3 - -.L122: // C!=0 S==0 - xvld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvfmul.s VT0, VX0, VXC - xvfmul.s VT1, VX2, VXC - xvst VT0, X, 0 * SIZE - xvstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 7 - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L122 - b .L997 - .align 3 - -.L123: // C==0 S!=0 - xvld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * 
SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvfmul.s VT0, VX2, VXS - xvfmul.s VT1, VX0, VXS - xvfsub.s VT1, VXZ, VT1 - xvst VT0, X, 0 * SIZE - xvstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 7 - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L123 - b .L997 - .align 3 - -.L124: // C==0 S==0 - xvst VXZ, X, 0 * SIZE - xvstelm.w VXZ, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 7 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L124 - b .L997 - .align 3 - -.L21:// INCX!=1 and INCY==1 - bge $r0, I, .L997 - move XX, X - fcmp.ceq.s $fcc0, C, a1 - bcnez $fcc0, .L210 - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L212 // C!=0 S==0 - b .L211 // C!=0 S!=0 - .align 3 - -.L210: - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L214 // C==0 S==0 - b .L213 // C==0 S!=0 - .align 3 - -.L211: // C!=0 S!=0 - xvld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - add.d X, X, INCX - xvfmul.s VT0, VXC, VX0 - xvfmadd.s VT0, VX2, VXS, VT0 - xvfmul.s VT1, VX0, VXS - xvfmsub.s VT1, VX2, VXC, VT1 - xvstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 4 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 5 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 6 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 7 - add.d XX, XX, INCX - xvst VT1, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - b .L997 - .align 3 - -.L212: // C!=0 S==0 - xvld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - add.d X, X, INCX - xvfmul.s VT0, VXC, VX0 - xvfmul.s VT1, VX2, VXC - xvstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 
0, 4 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 5 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 6 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 7 - add.d XX, XX, INCX - xvst VT1, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L212 - b .L997 - .align 3 - -.L213: // C==0 S!=0 - xvld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - add.d X, X, INCX - xvfmul.s VT0, VXS, VX2 - xvfmul.s VT1, VXS, VX0 - xvfsub.s VT1, VXZ, VT1 - xvstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 4 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 5 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 6 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 7 - add.d XX, XX, INCX - xvst VT1, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L213 - b .L997 - .align 3 - -.L214: // C==0 S==0 - xvstelm.w VXZ, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 3 - add.d XX, XX, INCX - xvst VT1, Y, 0 * SIZE - xvstelm.w VXZ, XX, 0, 4 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 5 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 6 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 7 - add.d XX, XX, INCX - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - b .L997 - .align 3 - -.L22: - bge $r0, I, .L997 - move YY, Y - move XX, X - fcmp.ceq.s $fcc0, C, a1 - bcnez $fcc0, .L220 - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L222 // C!=0 S==0 - b .L221 // C!=0 S!=0 - .align 3 - -.L220: - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L224 // C==0 S==0 - b .L223 // C==0 S!=0 - .align 3 - -.L221: // C!=0 S!=0 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvfmul.s VT0, VX0, VXC - xvfmadd.s VT0, VX2, VXS, VT0 - xvfmul.s VT1, VX0, VXS - xvfmsub.s VT1, VX2, VXC, VT1 - xvstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - 
xvstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 4 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 5 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 6 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 7 - add.d XX, XX, INCX - xvstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 7 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L221 - b .L997 - .align 3 - -.L222: // C!=0 S==0 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvfmul.s VT0, VX0, VXC - xvfmul.s VT1, VX2, VXC - xvstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 4 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 5 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 6 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 7 - add.d XX, XX, INCX - xvstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 7 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L222 - b .L997 - .align 3 - -.L223: // C==0 S!=0 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - ld.w t1, Y, 0 * SIZE - 
add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvfmul.s VT0, VX2, VXS - xvfmul.s VT1, VX0, VXS - xvfsub.s VT1, VXZ, VT1 - xvstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 4 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 5 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 6 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 7 - add.d XX, XX, INCX - xvstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 7 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L223 - b .L997 - .align 3 - -.L224: // C==0 S==0 - xvstelm.w VXZ, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 3 - add.d XX, XX, INCX - xvstelm.w VXZ, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VXZ, XX, 0, 4 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 5 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 6 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 7 - add.d XX, XX, INCX - xvstelm.w VXZ, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 7 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L224 - b .L997 - .align 3 - -.L997: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L998: - fld.s $f12, X, 0 * SIZE - fld.s $f13, Y, 0 * SIZE - fmul.s $f10, $f12, C - fmadd.s $f10, $f13, S, $f10 - fst.s $f10, X, 0 * SIZE - addi.d I, I, -1 - fmul.s $f20, $f12, S - fmsub.s $f20, $f13, C, $f20 - fst.s $f20, Y, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L998 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/srot_lsx.S b/kernel/loongarch64/srot_lsx.S deleted file mode 100644 index 8822b58e4..000000000 --- a/kernel/loongarch64/srot_lsx.S +++ /dev/null @@ -1,927 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define Y $r7 -#define INCY $r8 -#define C $f0 -#define S $f1 - -#define I $r12 -#define TEMP $r13 -#define t1 $r14 -#define t2 $r16 -#define t3 $r15 -#define t4 $r17 -#define XX $r18 -#define YY $r19 -#define a1 $f12 -#define VX0 $vr8 -#define VX1 $vr20 -#define VX2 $vr21 -#define VX3 $vr22 -#define VT0 $vr10 -#define VT1 $vr18 -#define VXC $vr23 -#define VXS $vr9 -#define VXZ $vr19 - - PROLOGUE - - bge $r0, N, .L999 - li.d TEMP, 1 - movgr2fr.d a1, $r0 - ffint.s.l a1, a1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - movfr2gr.s t1, C - vreplgr2vr.w VXC, t1 - movfr2gr.s t2, S - vreplgr2vr.w VXS, t2 - movfr2gr.s t3, a1 - vreplgr2vr.w VXZ, t3 - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, 
TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L997 - fcmp.ceq.s $fcc0, C, a1 - bcnez $fcc0, .L110 - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L112 // C!=0 S==0 - b .L111 // C!=0 S!=0 - .align 3 - -.L110: - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L114 // C==0 S==0 - b .L113 // C==0 S!=0 - .align 3 - -.L111: // C!=0 S!=0 - vld VX0, X, 0 * SIZE - vld VX2, Y, 0 * SIZE - vld VX1, X, 4 * SIZE - vld VX3, Y, 4 * SIZE - vfmul.s VT0, VX0, VXC - vfmadd.s VT0, VX2, VXS, VT0 - vfmul.s VT1, VX0, VXS - vfmsub.s VT1, VX2, VXC, VT1 - vst VT0, X, 0 * SIZE - vst VT1, Y, 0 * SIZE - vfmul.s VT0, VX1, VXC - vfmadd.s VT0, VX3, VXS, VT0 - vfmul.s VT1, VX1, VXS - vfmsub.s VT1, VX3, VXC, VT1 - vst VT0, X, 4 * SIZE - vst VT1, Y, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L111 - b .L997 - .align 3 - -.L112: // C!=0 S==0 - vld VX0, X, 0 * SIZE - vld VX2, Y, 0 * SIZE - vld VX1, X, 4 * SIZE - vld VX3, Y, 4 * SIZE - vfmul.s VT0, VX0, VXC - vfmul.s VT1, VX2, VXC - vst VT0, X, 0 * SIZE - vst VT1, Y, 0 * SIZE - vfmul.s VT0, VX1, VXC - vfmul.s VT1, VX3, VXC - vst VT0, X, 4 * SIZE - vst VT1, Y, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L112 - b .L997 - .align 3 - -.L113: // C==0 S!=0 - vld VX0, X, 0 * SIZE - vld VX2, Y, 0 * SIZE - vld VX1, X, 4 * SIZE - vld VX3, Y, 4 * SIZE - vfmul.s VT0, VX2, VXS - vfmul.s VT1, VX0, VXS - vfsub.s VT1, VXZ, VT1 - vst VT0, X, 0 * SIZE - vst VT1, Y, 0 * SIZE - vfmul.s VT0, VX3, VXS - vfmul.s VT1, VX1, VXS - vfsub.s VT1, VXZ, VT1 - vst VT0, X, 4 * SIZE - vst VT1, Y, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L113 - b .L997 - .align 3 - -.L114: // C==0 S==0 - vst VXZ, X, 0 * SIZE - vst VXZ, Y, 0 * SIZE - vst VXZ, X, 4 * SIZE - vst VXZ, Y, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L114 - b .L997 - .align 3 - -.L12: // INCX==1 and INCY!=1 - bge $r0, I, .L997 - move YY, Y - move XX, X - fcmp.ceq.s $fcc0, C, a1 - bcnez $fcc0, .L120 - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L122 // C!=0 S==0 - b .L121 // C!=0 S!=0 - .align 3 - -.L120: - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L124 // C==0 S==0 - b .L123 // C==0 S!=0 - .align 3 - -.L121: // C!=0 S!=0 - vld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - vfmul.s VT0, VX0, VXC - vfmadd.s VT0, VX2, VXS, VT0 - vfmul.s VT1, VX0, VXS - vfmsub.s VT1, VX2, VXC, VT1 - vst VT0, X, 0 * SIZE - vstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - vld VX1, X, 4 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - add.d Y, Y, INCY - vfmul.s VT0, VX1, VXC - vfmadd.s VT0, VX3, VXS, VT0 - vfmul.s VT1, VX1, VXS - vfmsub.s VT1, VX3, VXC, VT1 - vst VT0, X, 4 * SIZE - vstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - addi.d I, 
I, -1 - blt $r0, I, .L121 - b .L997 - .align 3 - -.L122: // C!=0 S==0 - vld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - vfmul.s VT0, VX0, VXC - vfmul.s VT1, VX2, VXC - vst VT0, X, 0 * SIZE - vstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - vld VX1, X, 4 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - add.d Y, Y, INCY - vfmul.s VT0, VX1, VXC - vfmul.s VT1, VX3, VXC - vst VT0, X, 4 * SIZE - vstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L122 - b .L997 - .align 3 - -.L123: // C==0 S!=0 - vld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - vfmul.s VT0, VX2, VXS - vfmul.s VT1, VX0, VXS - vfsub.s VT1, VXZ, VT1 - vst VT0, X, 0 * SIZE - vstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - vld VX1, X, 4 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - add.d Y, Y, INCY - vfmul.s VT0, VX3, VXS - vfmul.s VT1, VX1, VXS - vfsub.s VT1, VXZ, VT1 - vst VT0, X, 4 * SIZE - vstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L123 - b .L997 - .align 3 - -.L124: // C==0 S==0 - vst VXZ, X, 0 * SIZE - vst VXZ, X, 4 * SIZE - vstelm.w VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 3 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 3 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L124 - b .L997 - .align 3 - -.L21:// INCX!=1 and INCY==1 - bge $r0, I, .L997 - move XX, X - fcmp.ceq.s $fcc0, C, a1 - bcnez $fcc0, .L210 - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L212 // C!=0 S==0 - b .L211 // C!=0 S!=0 - .align 3 - -.L210: - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L214 // C==0 S==0 - b .L213 // C==0 S!=0 - .align 3 - -.L211: // C!=0 S!=0 - vld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w 
VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - add.d X, X, INCX - vfmul.s VT0, VXC, VX0 - vfmadd.s VT0, VX2, VXS, VT0 - vfmul.s VT1, VXS, VX0 - vfmsub.s VT1, VX2, VXC, VT1 - vstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - vst VT1, Y, 0 * SIZE - vld VX3, Y, 4 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - add.d X, X, INCX - vfmul.s VT0, VX1, VXC - vfmadd.s VT0, VX3, VXS, VT0 - vfmul.s VT1, VX1, VXS - vfmsub.s VT1, VX3, VXC, VT1 - vstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - vst VT1, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - b .L997 - .align 3 - -.L212: // C!=0 S==0 - vld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - add.d X, X, INCX - vfmul.s VT0, VXC, VX0 - vfmul.s VT1, VX2, VXC - vstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - vst VT1, Y, 0 * SIZE - vld VX3, Y, 4 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - add.d X, X, INCX - vfmul.s VT0, VX1, VXC - vfmul.s VT1, VX3, VXS - vstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - vst VT1, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L212 - b .L997 - .align 3 - -.L213: // C==0 S!=0 - vld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - add.d X, X, INCX - vfmul.s VT0, VXS, VX2 - vfmul.s VT1, VXS, VX0 - vfsub.s VT1, VXZ, VT1 - vstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - vst VT1, Y, 0 * SIZE - vld VX3, Y, 4 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - add.d X, X, INCX - vfmul.s VT0, VX3, VXS - vfmul.s VT1, VX1, VXS - vfsub.s VT1, VXZ, VT1 - vstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - vst VT1, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L213 - b .L997 - .align 3 - -.L214: // C==0 S==0 - vstelm.w VXZ, XX, 0, 0 - 
add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 3 - add.d XX, XX, INCX - vst VT1, Y, 0 * SIZE - vstelm.w VXZ, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 3 - add.d XX, XX, INCX - vst VT1, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - b .L997 - .align 3 - -.L22: - bge $r0, I, .L997 - move YY, Y - move XX, X - fcmp.ceq.s $fcc0, C, a1 - bcnez $fcc0, .L220 - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L222 // C!=0 S==0 - b .L221 // C!=0 S!=0 - .align 3 - -.L220: - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L224 // C==0 S==0 - b .L223 // C==0 S!=0 - .align 3 - -.L221: // C!=0 S!=0 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - vfmul.s VT0, VX0, VXC - vfmadd.s VT0, VX2, VXS, VT0 - vfmul.s VT1, VX0, VXS - vfmsub.s VT1, VX2, VXC, VT1 - vstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - vstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - add.d X, X, INCX - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - add.d Y, Y, INCY - vfmul.s VT0, VX1, VXC - vfmadd.s VT0, VX3, VXS, VT0 - vfmul.s VT1, VX0, VXS - vfmsub.s VT1, VX3, VXC, VT1 - vstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - vstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L221 - b .L997 - .align 3 - -.L222: // C!=0 S==0 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - vfmul.s VT0, VX0, VXC - vfmul.s VT1, VX2, VXC - vstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 1 
- add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - vstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - add.d Y, Y, INCY - vfmul.s VT0, VX1, VXC - vfmul.s VT1, VX3, VXC - vstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - vstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L222 - b .L997 - .align 3 - -.L223: // C==0 S!=0 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - vfmul.s VT0, VX2, VXS - vfmul.s VT1, VX0, VXS - vfsub.s VT1, VXZ, VT1 - vstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - vstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - add.d Y, Y, INCY - vfmul.s VT0, VX3, VXS - vfmul.s VT1, VX0, VXS - vfsub.s VT1, VXZ, VT1 - vstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - vstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L223 - b .L997 - .align 3 - -.L224: // C==0 S==0 - vstelm.w VXZ, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 3 - add.d XX, XX, INCX - 
vstelm.w VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 3 - add.d YY, YY, INCY - vstelm.w VXZ, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 3 - add.d XX, XX, INCX - vstelm.w VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 3 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L224 - b .L997 - .align 3 - -.L997: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L998: - fld.s $f12, X, 0 * SIZE - fld.s $f13, Y, 0 * SIZE - fmul.s $f10, $f12, C - fmadd.s $f10, $f13, S, $f10 - fst.s $f10, X, 0 * SIZE - addi.d I, I, -1 - fmul.s $f20, $f12, S - fmsub.s $f20, $f13, C, $f20 - fst.s $f20, Y, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L998 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file From 173a65d4e6ddf8bc5a9b1cd02d594d4b48dc9f89 Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Mon, 25 Dec 2023 15:11:04 +0800 Subject: [PATCH 07/21] loongarch64: Add and refine iamax optimization functions. --- common_loongarch64.h | 4 + kernel/loongarch64/KERNEL.LOONGSON2K1000 | 7 +- kernel/loongarch64/KERNEL.LOONGSON3R5 | 7 +- kernel/loongarch64/camax_lasx.S | 194 ++++++ kernel/loongarch64/camax_lsx.S | 206 +++++++ .../{isamax_lasx.S => iamax_lasx.S} | 254 ++++++-- kernel/loongarch64/iamax_lsx.S | 482 +++++++++++++++ kernel/loongarch64/icamax_lasx.S | 562 ++++++++++++++++++ kernel/loongarch64/icamax_lsx.S | 434 ++++++++++++++ kernel/loongarch64/idamax_lasx.S | 275 --------- kernel/loongarch64/idamax_lsx.S | 267 --------- kernel/loongarch64/isamax_lsx.S | 275 --------- 12 files changed, 2101 insertions(+), 866 deletions(-) create mode 100644 kernel/loongarch64/camax_lasx.S create mode 100644 kernel/loongarch64/camax_lsx.S rename kernel/loongarch64/{isamax_lasx.S => iamax_lasx.S} (55%) create mode 100644 kernel/loongarch64/iamax_lsx.S create mode 100644 kernel/loongarch64/icamax_lasx.S create mode 100644 kernel/loongarch64/icamax_lsx.S delete mode 100644 kernel/loongarch64/idamax_lasx.S delete mode 100644 kernel/loongarch64/idamax_lsx.S delete mode 100644 kernel/loongarch64/isamax_lsx.S diff --git a/common_loongarch64.h b/common_loongarch64.h index 13514d6e0..599b4795c 100644 --- a/common_loongarch64.h +++ b/common_loongarch64.h @@ -139,6 +139,7 @@ static inline int WhereAmI(void){ #define XVFMAX xvfmax.d #define XVFMAXA xvfmaxa.d #define XVCMPEQ xvfcmp.ceq.d +#define XVCMPLE xvfcmp.cle.d #define XVCMPLT xvfcmp.clt.d #define XVMUL xvfmul.d #define XVMSUB xvfmsub.d @@ -151,6 +152,7 @@ static inline int WhereAmI(void){ #define VFMAX vfmax.d #define VFMAXA vfmaxa.d #define VCMPEQ vfcmp.ceq.d +#define VCMPLE vfcmp.cle.d #define VCMPLT vfcmp.clt.d #define VMUL vfmul.d #define VMSUB vfmsub.d @@ -189,6 +191,7 @@ static inline int WhereAmI(void){ #define XVFMAX xvfmax.s #define XVFMAXA xvfmaxa.s #define XVCMPEQ xvfcmp.ceq.s +#define XVCMPLE xvfcmp.cle.s #define XVCMPLT xvfcmp.clt.s #define XVMUL xvfmul.s #define XVMSUB xvfmsub.s @@ -201,6 +204,7 @@ static inline int WhereAmI(void){ #define VFMAX vfmax.s #define VFMAXA vfmaxa.s #define VCMPEQ vfcmp.ceq.s +#define VCMPLE vfcmp.cle.s #define VCMPLT vfcmp.clt.s #define VMUL vfmul.s #define VMSUB vfmsub.s diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 
index b315c81f2..a8a6dd82f 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -9,6 +9,7 @@ DSCALKERNEL = dscal_lsx.S SAMAXKERNEL = amax_lsx.S DAMAXKERNEL = amax_lsx.S +CAMAXKERNEL = camax_lsx.S SAMINKERNEL = amin_lsx.S DAMINKERNEL = amin_lsx.S @@ -25,8 +26,10 @@ IDMAXKERNEL = imax_lsx.S ISMINKERNEL = imin_lsx.S IDMINKERNEL = imin_lsx.S -ISAMAXKERNEL = isamax_lsx.S -IDAMAXKERNEL = idamax_lsx.S +ISAMAXKERNEL = iamax_lsx.S +IDAMAXKERNEL = iamax_lsx.S +ICAMAXKERNEL = icamax_lsx.S +IZAMAXKERNEL = icamax_lsx.S ISAMINKERNEL = iamin_lsx.S IDAMINKERNEL = iamin_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 577f6316e..e4c45e1fa 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -9,6 +9,7 @@ DSCALKERNEL = dscal_lasx.S SAMAXKERNEL = amax_lasx.S DAMAXKERNEL = amax_lasx.S +CAMAXKERNEL = camax_lasx.S SAMINKERNEL = amin_lasx.S DAMINKERNEL = amin_lasx.S @@ -25,8 +26,10 @@ IDMAXKERNEL = imax_lasx.S ISMINKERNEL = imin_lasx.S IDMINKERNEL = imin_lasx.S -ISAMAXKERNEL = isamax_lasx.S -IDAMAXKERNEL = idamax_lasx.S +ISAMAXKERNEL = iamax_lasx.S +IDAMAXKERNEL = iamax_lasx.S +ICAMAXKERNEL = icamax_lasx.S +IZAMAXKERNEL = icamax_lasx.S ISAMINKERNEL = iamin_lasx.S IDAMINKERNEL = iamin_lasx.S diff --git a/kernel/loongarch64/camax_lasx.S b/kernel/loongarch64/camax_lasx.S new file mode 100644 index 000000000..7013430cb --- /dev/null +++ b/kernel/loongarch64/camax_lasx.S @@ -0,0 +1,194 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $f14 +#define t2 $f18 +#define t3 $f15 +#define t4 $f17 +#define s1 $f22 +#define s2 $f9 +#define s3 $f10 +#define s4 $f11 +#define TEMP $r16 +#define a0 $f20 +#define a1 $f21 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VT0 $xr13 +#define VT1 $xr14 +#define res0 $xr18 +#define neg1 $xr19 +#define VX0 $xr20 +#define VX1 $xr21 +#define VM0 $xr22 +#define VM1 $xr23 + + PROLOGUE + xvxor.v VM0, VM0, VM0 + xvxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + li.w I, -1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + xvreplgr2vr.w neg1, I + xvffint.s.w neg1, neg1 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L23 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvld VX1, X, 8 * SIZE + addi.d I, I, -1 + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s x3, neg1, x1 + xvfmul.s x4, neg1, x2 + xvfcmp.clt.s VT0, x1, res0 + xvfcmp.clt.s VT1, x2, res0 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VT1 + addi.d X, X, 16 * SIZE + xvfadd.s VM1, x1, x2 + xvfmax.s VM0, VM0, VM1 + blt $r0, I, .L10 + .align 3 + +.L11: + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfmax.s VM1, x1, x2 + xvfmax.s VM0, x3, x4 + xvfmax.s VM0, VM0, VM1 + b .L23 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L23 + .align 3 + +.L21: + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + addi.d I, I, -1 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s3, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s4, t1, t3 + blt $r0, I, .L21 + .align 3 + +.L22: + fmax.s s1, s1, s2 + fmax.s s3, s3, s4 + fmax.s s1, s1, s3 + .align 3 + +.L23: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + addi.d I, I, -1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 + add.d X, X, INCX + fmax.s s1, a0, s1 + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/camax_lsx.S b/kernel/loongarch64/camax_lsx.S new file mode 100644 index 000000000..2e55629de --- /dev/null +++ b/kernel/loongarch64/camax_lsx.S @@ -0,0 +1,206 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $f14 +#define t2 $f18 +#define t3 $f15 +#define t4 $f17 +#define s1 $f22 +#define s2 $f9 +#define s3 $f10 +#define s4 $f11 +#define TEMP $r16 +#define a0 $f20 +#define a1 $f21 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VT0 $vr13 +#define VT1 $vr14 +#define res0 $vr18 +#define neg1 $vr19 +#define VX0 $vr20 +#define VX1 $vr21 +#define VM0 $vr22 +#define VM1 $vr23 + + PROLOGUE + vxor.v VM0, VM0, VM0 + vxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + li.w I, -1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + vreplgr2vr.w neg1, I + vffint.s.w neg1, neg1 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L23 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + addi.d I, I, -1 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, neg1, x1 + vfmul.s x4, neg1, x2 + vfcmp.clt.s VT0, x1, res0 + vfcmp.clt.s VT1, x2, res0 + vld VX0, X, 8 * SIZE + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT1 + vld VX1, X, 12 * SIZE + vfadd.s VM1, x1, x2 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, neg1, x1 + vfmul.s x4, neg1, x2 + vfcmp.clt.s VT0, x1, res0 + vfcmp.clt.s VT1, x2, res0 + addi.d X, X, 16 * SIZE + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT1 + vfadd.s x1, x1, x2 + vfmax.s VM1, x1, VM1 + vfmax.s VM0, VM0, VM1 + blt $r0, I, .L10 + .align 3 + +.L11: + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmax.s VM1, x1, x2 + vfmax.s VM0, x3, x4 + vfmax.s VM0, VM0, VM1 + b .L23 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L23 + .align 3 + +.L21: + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, 
t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + addi.d I, I, -1 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s3, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s4, t1, t3 + blt $r0, I, .L21 + .align 3 + +.L22: + fmax.s s1, s1, s2 + fmax.s s3, s3, s4 + fmax.s s1, s1, s3 + .align 3 + +.L23: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + fld.s a0, X, 0 * SIZE + fld.s a1, X, 1 * SIZE + addi.d I, I, -1 + fabs.s a0, a0 + fabs.s a1, a1 + fadd.s a0, a0, a1 + add.d X, X, INCX + fmax.s s1, a0, s1 + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/isamax_lasx.S b/kernel/loongarch64/iamax_lasx.S similarity index 55% rename from kernel/loongarch64/isamax_lasx.S rename to kernel/loongarch64/iamax_lasx.S index 2800b1d43..090da3004 100644 --- a/kernel/loongarch64/isamax_lasx.S +++ b/kernel/loongarch64/iamax_lasx.S @@ -1,3 +1,30 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + #define ASSEMBLER #include "common.h" @@ -39,6 +66,31 @@ slli.d INCX, INCX, BASE_SHIFT bne INCX, TEMP, .L20 xvld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else addi.w i0, i0, 1 srai.d I, N, 3 bge $r0, I, .L21 @@ -76,9 +128,25 @@ xvinsgr2vr.w VI0, i0, 6 //7 addi.w i0, i0, 1 xvinsgr2vr.w VI0, i0, 7 //8 +#endif .align 3 .L10: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvadd.d VI1, VI1, VINC8 + xvld VX1, X, 4 * SIZE + xvadd.d VI2, VI1, VINC4 + xvfmaxa.d VM1, VX0, VX1 + xvfcmp.ceq.d VT0, VX0, VM1 + addi.d I, I, -1 + xvbitsel.v VI2, VI2, VI1, VT0 + xvfmaxa.d VM1, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI2, VI0, VT0 +#else xvld VX0, X, 0 * SIZE addi.d I, I, -1 xvadd.w VI1, VI1, VINC8 @@ -87,10 +155,21 @@ addi.d X, X, 8 * SIZE xvbitsel.v VM0, VM1, VM0, VT0 xvbitsel.v VI0, VI1, VI0, VT0 +#endif blt $r0, I, .L10 .align 3 .L15: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 +#else xvxor.v VX0, VX0, VX0 xvor.v VX0, VI0, VX0 xvxor.v VX1, VX1, VX1 @@ -103,28 +182,62 @@ xvpickve.w x2, VM0, 1 xvpickve.w x3, VM0, 2 xvpickve.w x4, VM0, 3 - xvfmaxa.s VM1, x1, x2 - xvfcmp.ceq.s VT0, x1, VM1 +#endif + XVFMAXA VM1, x1, x2 + XVCMPEQ VT0, x1, VM1 xvbitsel.v VINC4, VI2, VI1, VT0 - xvfmaxa.s VM0, x3, x4 - xvfcmp.ceq.s VT0, x3, VM0 + XVFMAXA VM0, x3, x4 + XVCMPEQ VT0, x3, VM0 xvbitsel.v VINC8, VI4, VI3, VT0 - xvfmaxa.s VM0, VM0, VM1 - xvfcmp.ceq.s VT0, VM0, VM1 + XVFMAXA VM0, VM0, VM1 + XVCMPEQ VT0, VM0, VM1 xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - xvfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 + CMPEQ $fcc0, $f15, $f9 bceqz $fcc0, .L26 - xvfcmp.clt.s VT0, VI1, VI0 + XVCMPLT VT0, VI1, VI0 xvbitsel.v VI0, VI0, VI1, VT0 b .L26 .align 3 .L20: // INCX!=1 move TEMP, X +#ifdef DOUBLE + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t2, 1 + xvinsgr2vr.d VM0, t3, 2 + xvinsgr2vr.d VM0, t4, 3 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else addi.w i0, i0, 1 
ld.w t1, TEMP, 0 * SIZE add.d TEMP, TEMP, INCX @@ -186,9 +299,46 @@ xvinsgr2vr.w VI0, i0, 6 //7 addi.w i0, i0, 1 xvinsgr2vr.w VI0, i0, 7 //8 +#endif .align 3 .L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + xvadd.d VI1, VI1, VINC8 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvadd.d VI2, VI1, VINC4 + xvfmaxa.d VM1, VX0, VX1 + xvfcmp.ceq.d VT0, VX0, VM1 + addi.d I, I, -1 + xvbitsel.v VI2, VI2, VI1, VT0 + xvfmaxa.d VM1, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI2, VI0, VT0 +#else ld.w t1, X, 0 * SIZE add.d X, X, INCX ld.w t2, X, 0 * SIZE @@ -219,10 +369,30 @@ addi.d I, I, -1 xvbitsel.v VM0, VM1, VM0, VT0 xvbitsel.v VI0, VI1, VI0, VT0 +#endif blt $r0, I, .L24 .align 3 .L25: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmaxa.d VM1, x1, x2 + xvfcmp.ceq.d VT0, x1, VM1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmaxa.d VM0, x4, x3 + xvfcmp.ceq.d VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmaxa.d VM0, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#else xvxor.v VX0, VX0, VX0 xvor.v VX0, VI0, VX0 xvxor.v VX1, VX1, VX1 @@ -245,44 +415,45 @@ xvfcmp.ceq.s VT0, VM0, VM1 xvbitsel.v VM0, VM0, VM1, VT0 xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - xvfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 +#endif + CMPEQ $fcc0, $f15, $f9 bceqz $fcc0, .L26 - xvfcmp.clt.s VT0, VI1, VI0 + XVCMPLT VT0, VI1, VI0 xvbitsel.v VI0, VI0, VI1, VT0 .align 3 .L26: - xvfcmp.ceq.s VT0, VM0, x2 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f10 bceqz $fcc0, .L27 - xvfcmp.clt.s VT0, VI2, VI0 + XVCMPLT VT0, VI2, VI0 xvbitsel.v VI0, VI0, VI2, VT0 .align 3 .L27: - xvfcmp.ceq.s VT0, VM0, x3 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f11 bceqz $fcc0, .L28 - xvfcmp.clt.s VT0, VI3, VI0 + XVCMPLT VT0, VI3, VI0 xvbitsel.v VI0, VI0, VI3, VT0 .align 3 .L28: - xvfcmp.ceq.s VT0, VM0, x4 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f12 bceqz $fcc0, .L29 - xvfcmp.clt.s VT0, VI4, VI0 + XVCMPLT VT0, VI4, VI0 xvbitsel.v VI0, VI0, VI4, VT0 .align 3 .L29: +#ifdef DOUBLE + movfr2gr.d i0, $f20 +#else fmov.s $f16, $f20 +#endif .align 3 +#ifdef DOUBLE + +#else .L252: xvxor.v VI0, VI0, VI0 xvor.v VI0, VI0, VX0 @@ -306,35 +477,27 @@ xvfmaxa.s VM0, VM0, VM1 xvfcmp.ceq.s VT0, VM0, VM1 xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - xvfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f9 bceqz $fcc0, .L262 xvfcmp.clt.s VT0, VI1, VI0 xvbitsel.v VI0, VI0, VI1, VT0 .align 3 .L262: - xvfcmp.ceq.s VT0, VM0, x2 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f10 bceqz $fcc0, .L272 xvfcmp.clt.s VT0, VI2, VI0 xvbitsel.v VI0, VI0, VI2, VT0 .align 3 - .L272: - xvfcmp.ceq.s VT0, VM0, x3 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, 
$f11 bceqz $fcc0, .L282 xvfcmp.clt.s VT0, VI3, VI0 xvbitsel.v VI0, VI0, VI3, VT0 .align 3 .L282: - xvfcmp.ceq.s VT0, VM0, x4 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f12 bceqz $fcc0, .L292 xvfcmp.clt.s VT0, VI4, VI0 xvbitsel.v VI0, VI0, VI4, VT0 @@ -345,8 +508,9 @@ xvfcmp.ceq.s VT0, VM0, VX0 xvbitsel.v VI0, VI0, VI1, VT0 movfr2gr.s i0, $f20 +#endif -.L21: // N<8 +.L21: //N<8 andi I, N, 7 bge $r0, I, .L999 srai.d i1, N, 3 @@ -357,17 +521,17 @@ .align 3 .L22: - fld.s $f9, X, 0 + LD $f9, X, 0 addi.d I, I, -1 - xvfmaxa.s VM1, x1, VM0 - xvfcmp.ceq.s VT0, VM0, VM1 + XVFMAXA VM1, x1, VM0 + XVCMPEQ VT0, VM0, VM1 add.d X, X, INCX xvbitsel.v VM0, VM1, VM0, VT0 xvbitsel.v VI0, VI1, VI0, VT0 addi.d i1, i1, 1 movgr2fr.d $f21, i1 blt $r0, I, .L22 - movfr2gr.s i0, $f20 + MTG i0, $f20 .align 3 .L999: @@ -375,4 +539,4 @@ jirl $r0, $r1, 0x0 .align 3 - EPILOGUE \ No newline at end of file + EPILOGUE diff --git a/kernel/loongarch64/iamax_lsx.S b/kernel/loongarch64/iamax_lsx.S new file mode 100644 index 000000000..ce5b3c724 --- /dev/null +++ b/kernel/loongarch64/iamax_lsx.S @@ -0,0 +1,482 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
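Editor's note: the hunks above turn isamax_lasx.S into a shared iamax_lasx.S and drop the scalar tie-breaking sequence whose removed comment (处理尾数相等时取最小序号) reads, roughly, "when values are equal, take the smallest index". A hedged C reference of the semantics the kernel preserves (double variant shown; names are illustrative, not the OpenBLAS API):

#include <math.h>

/* BLAS I*AMAX: 1-based index of the first element with the largest
 * absolute value; on ties the lowest index wins, which is what the
 * VIx/VMx xvbitsel.v chains above select for. */
static long iamax_ref(const double *x, long n, long inc_x)
{
    if (n <= 0 || inc_x <= 0) return 0;
    long best = 1;
    double m = fabs(x[0]);
    for (long i = 1; i < n; i++) {
        double a = fabs(x[i * inc_x]);
        if (a > m) { m = a; best = i + 1; }  /* strict >, so the earlier index is kept on ties */
    }
    return best;
}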
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VINC2 $vr17 +#define VINC4 $vr18 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + vld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L11 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC2, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC2, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w VINC4, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L10: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 2 * SIZE + vadd.d VI2, VI1, VINC2 + vfmaxa.d x1, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x1 + vbitsel.v x2, VI2, VI1, VT0 + vld VX0, X, 4 * SIZE + vadd.d VI1, VI2, VINC2 + vld VX1, X, 6 * SIZE + vadd.d VI2, VI1, VINC2 + vfmaxa.d x3, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x3 + vbitsel.v x4, VI2, VI1, VT0 + vfmaxa.d x3, x1, x3 + vfcmp.ceq.d VT0, x1, x3 + vbitsel.v x2, x4, x2, VT0 + vfmaxa.d VM1, VM0, x3 + vfcmp.ceq.d VT0, VM0, VM1 + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, x2, VI0, VT0 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE +#else + vld VX0, X, 0 * SIZE + vadd.w VI1, VI1, VINC4 + vld VX1, X, 4 * SIZE + vadd.w VI2, VI1, VINC2 + vfmaxa.s VM1, VX0, VX1 + vfcmp.ceq.s VT0, VX0, VM1 + addi.d I, I, -1 + vbitsel.v VI2, VI2, VI1, VT0 + vfmaxa.s VM1, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI2, VI0, VT0 +#endif + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L16 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L17 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmaxa.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC2, VI2, VI1, VT0 + vfmaxa.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC4, VI4, VI3, VT0 + vfmaxa.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC4, VINC2, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, 
VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L26 +#endif + .align 3 + +#ifdef DOUBLE +.L16: + vfmaxa.d VM0, x1, x2 + vfcmp.ceq.d VT0, x1, VM0 + vbitsel.v VI0, VI2, VI1, VT0 + .align 3 + +.L17: + movfr2gr.d i0, $f20 + .align 3 + +.L11: //INCX==1 and N<8 + andi I, N, 7 + bge $r0, I, .L14 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L13: + fld.d $f9, X, 0 + vfmaxa.d VM1, x1, VM0 + vfcmp.ceq.d VT0, VM0, VM1 + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + addi.d I, I, -1 + addi.d i1, i1, 1 + addi.d X, X, SIZE + movgr2fr.d $f21, i1 + blt $r0, I, .L13 + movfr2gr.d i0, $f20 + .align 3 + +.L14: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + move TEMP, X + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t2, 1 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC2, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 + .align 3 + +.L24: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI1, VINC4 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, VINC2 + vfmaxa.d x1, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x1 + vbitsel.v x2, VI2, VI1, VT0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI2, VINC2 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, VINC2 + vfmaxa.d x3, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x3 + vbitsel.v x4, VI2, VI1, VT0 + vfmaxa.d x3, x1, x3 + vfcmp.ceq.d VT0, x1, x3 + vbitsel.v x2, x4, x2, VT0 + vfmaxa.d VM1, VM0, x3 + vbitsel.v VM0, VM1, VM0, VT0 + vfcmp.ceq.d VT0, VM0, VM1 + vbitsel.v VI0, x2, VI0, VT0 + addi.d I, I, -1 + blt $r0, I, .L24 + .align 3 + +.L25: + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 + .align 3 + +.L26: + vfmaxa.d VM0, x1, x2 + vfcmp.ceq.d VT0, x1, VM0 + vbitsel.v VI0, VI2, VI1, VT0 + .align 3 + +.L27: + movfr2gr.d i0, $f20 + .align 3 + +#else +.L20: // INCX!=1 + move TEMP, X + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t2, 1 + vinsgr2vr.w VM0, t3, 2 + vinsgr2vr.w VM0, t4, 3 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC2, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w VINC4, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 
1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 + .align 3 + +.L24: + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vadd.w VI1, VI1, VINC4 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vadd.w VI2, VI1, VINC2 + vfmaxa.s VM1, VX0, VX1 + vfcmp.ceq.s VT0, VX0, VM1 + vbitsel.v VI2, VI2, VI1, VT0 + vfmaxa.s VM1, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + addi.d I, I, -1 + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI2, VI0, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmaxa.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC2, VI2, VI1, VT0 + vfmaxa.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC4, VI4, VI3, VT0 + vfmaxa.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC4, VINC2, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L26: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + vfcmp.clt.s VT0, VI2, VI0 + vbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + vfcmp.clt.s VT0, VI3, VI0 + vbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + vfcmp.clt.s VT0, VI4, VI0 + vbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: + movfr2gr.s i0, $f20 + .align 3 + +#endif +.L21: // N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + LD $f9, X, 0 + VFMAXA VM1, x1, VM0 + VCMPEQ VT0, VM0, VM1 + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + addi.d I, I, -1 + addi.d i1, i1, 1 + add.d X, X, INCX + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/icamax_lasx.S b/kernel/loongarch64/icamax_lasx.S new file mode 100644 index 000000000..7800cb917 --- /dev/null +++ b/kernel/loongarch64/icamax_lasx.S @@ -0,0 +1,562 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
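Editor's note: the icamax kernels introduced next reduce over |re| + |im| per complex element and return a 1-based index. A hedged scalar reference of what they compute (single-precision shown; illustrative only, not the OpenBLAS API):

#include <math.h>

static long icamax_ref(const float *x, long n, long inc_x)
{
    if (n <= 0 || inc_x <= 0) return 0;
    long best = 1;
    float m = fabsf(x[0]) + fabsf(x[1]);
    for (long i = 1; i < n; i++) {
        const float *p = x + 2 * i * inc_x;      /* inc_x counted in complex elements */
        float a = fabsf(p[0]) + fabsf(p[1]);
        if (a > m) { m = a; best = i + 1; }      /* first maximum wins */
    }
    return best;
}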
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define a0 $f12 +#define a1 $f13 +#define s1 $f15 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VX0 $xr13 +#define VX1 $xr14 +#define VM0 $xr15 +#define VM1 $xr16 +#define VINC4 $xr17 +#define VINC8 $xr18 +#define VI0 $xr20 +#define VI1 $xr21 +#define VI2 $xr22 +#define VI3 $xr8 +#define VI4 $xr19 +#define VT0 $xr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + xvxor.v VM0, VM0, VM0 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + xvxor.v VI3, VI3, VI3 // 0 +#ifdef DOUBLE + li.d I, -1 + xvreplgr2vr.d VI4, I + xvffint.d.l VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, -1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 1 //3 + addi.d i0, i0, -1 + xvinsgr2vr.d VI0, i0, 2 //2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + li.w I, -1 + xvreplgr2vr.w VI4, I + xvffint.s.w VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, -3 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 2 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //6 + addi.w i0, i0, -3 + xvinsgr2vr.w VI0, i0, 4 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //4 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvadd.d VI1, VI1, VINC4 + xvld VX1, X, 4 * SIZE + addi.d I, I, -1 + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmul.d x3, VI4, x1 + xvfmul.d x4, VI4, x2 + xvfcmp.clt.d VT0, x1, VI3 + xvfcmp.clt.d VINC8, x2, 
VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC8 +#else + xvadd.w VI1, VI1, VINC8 + xvld VX1, X, 8 * SIZE + addi.d I, I, -1 + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s x3, VI4, x1 + xvfmul.s x4, VI4, x2 + xvfcmp.clt.s VT0, x1, VI3 + xvfcmp.clt.s VINC4, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC4 +#endif + XVFADD x1, x1, x2 + XVFMAX x3, VM0, x1 + XVCMPEQ VT0, x3, VM0 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, x3, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmax.d VM1, x1, x2 + xvfcmp.ceq.d VT0, VM1, x1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmax.d VM0, x3, x4 + xvfcmp.ceq.d VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmax.d VM0, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#else + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#endif + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + XVCMPLT VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + +.L20: // INCX!=1 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, -1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 1 //3 + addi.d i0, i0, -1 + xvinsgr2vr.d VI0, i0, 2 //2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, -3 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 2 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //6 + addi.w i0, i0, -3 + xvinsgr2vr.w VI0, i0, 4 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //4 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + xvadd.d VI1, VI1, VINC4 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * 
SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + addi.d I, I, -1 + xvfmul.d x3, VI4, x1 + xvfmul.d x4, VI4, x2 + xvfcmp.clt.d VT0, x1, VI3 + xvfcmp.clt.d VINC8, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC8 + xvfadd.d x1, x1, x2 + xvfmax.d x3, VM0, x1 + xvfcmp.ceq.d VT0, x3, VM0 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + xvadd.w VI1, VI1, VINC8 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + xvadd.w VI1, VI1, VINC8 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + addi.d I, I, -1 + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s x3, VI4, x1 + xvfmul.s x4, VI4, x2 + xvfcmp.clt.s VT0, x1, VI3 + xvfcmp.clt.s VINC8, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC8 + xvfadd.s x1, x1, x2 + xvfmax.s x3, VM0, x1 + xvfcmp.ceq.s VT0, x3, VM0 +#endif + xvbitsel.v VM0, x3, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmaxa.d VM1, x1, x2 + xvfcmp.ceq.d VT0, VM1, x1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmaxa.d VM0, x3, x4 + xvfcmp.ceq.d VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmaxa.d VM0, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#else + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#endif + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + XVCMPLT VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L26: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + XVCMPLT VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + XVCMPLT VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + XVCMPLT VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: +#ifdef DOUBLE + movfr2gr.d i0, $f20 +#else + fmov.s $f16, $f20 +#endif + .align 3 + +#ifdef DOUBLE +#else +.L252: + xvxor.v VI0, VI0, VI0 + xvor.v 
VI0, VI0, VX0 + fmov.s $f13, $f15 + xvxor.v VM0, VM0, VM0 + xvor.v VM0, VM0, VX1 + xvpickve.w VI1, VI0, 4 + xvpickve.w VI2, VI0, 5 + xvpickve.w VI3, VI0, 6 + xvpickve.w VI4, VI0, 7 + xvpickve.w x1, VM0, 4 + xvpickve.w x2, VM0, 5 + xvpickve.w x3, VM0, 6 + xvpickve.w x4, VM0, 7 + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v x1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM0, x1 + xvbitsel.v VM0, VM0, x1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L262 + xvfcmp.clt.s VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L262: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L272 + xvfcmp.clt.s VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L272: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L282 + xvfcmp.clt.s VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L282: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L292 + xvfcmp.clt.s VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L292: + fcmp.clt.s $fcc0, $f15, $f13 + fsel $f15, $f15, $f13, $fcc0 + fsel $f20, $f20, $f16, $fcc0 + movfr2gr.s i0, $f20 + +#endif +.L21: //N<8 +#ifdef DOUBLE + andi I, N, 3 + bge $r0, I, .L999 + srai.d i1, N, 2 + slli.d i1, i1, 2 +#else + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 +#endif + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + addi.d I, I, -1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 + FMAX a1, s1, a0 + CMPEQ $fcc0, s1, a1 + add.d X, X, INCX + fsel s1, a1, s1, $fcc0 + fsel $f20, $f21, $f20, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/icamax_lsx.S b/kernel/loongarch64/icamax_lsx.S new file mode 100644 index 000000000..a2fc9dbbd --- /dev/null +++ b/kernel/loongarch64/icamax_lsx.S @@ -0,0 +1,434 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
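Editor's note on the pattern used by the complex kernels in this patch: instead of an absolute-value instruction, VI4 is splatted with -1.0 and VI3 with 0.0, and the bitsel picks x or -x from the sign mask. A hedged scalar model of one lane:

/* Model of: xvfmul.d x3, VI4, x1; xvfcmp.clt.d VT0, x1, VI3;
 * xvbitsel.v x1, x1, x3, VT0   (VI4 = -1.0, VI3 = 0.0) */
static inline double abs_via_select(double x)
{
    double neg = -1.0 * x;        /* multiply by the splatted -1.0 */
    return (x < 0.0) ? neg : x;   /* keep the negated lane where x < 0 */
}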
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define a0 $f12 +#define a1 $f13 +#define s1 $f15 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VINC4 $vr17 +#define VINC8 $vr18 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + vxor.v VM0, VM0, VM0 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + vxor.v VI3, VI3, VI3 // 0 +#ifdef DOUBLE + li.d I, -1 + vreplgr2vr.d VI4, I + vffint.d.l VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -3 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + li.w I, -1 + vreplgr2vr.w VI4, I + vffint.s.w VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.w i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + addi.w i0, i0, -7 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L10: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 2 * SIZE + addi.d I, I, -1 + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VI4, x1 + vfmul.d x4, VI4, x2 + vfcmp.clt.d VT0, x1, VI3 + vfcmp.clt.d VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + vfadd.d x1, x1, x2 + vfmax.d x3, VM0, x1 + vfcmp.ceq.d VT0, x3, VM0 + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + vld VX0, X, 4 * SIZE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VI4, x1 + vfmul.d x4, VI4, x2 +#else + vadd.w VI1, VI1, VINC4 + vld VX1, X, 4 * SIZE + addi.d I, I, -1 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, VI4, x1 + vfmul.s x4, VI4, x2 +#endif + VCMPLT VT0, x1, VI3 + VCMPLT VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + VFADD x1, x1, x2 + VFMAX x3, VM0, x1 + VCMPEQ VT0, x3, VM0 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + 
vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmaxa.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC4, VI2, VI1, VT0 + vfmaxa.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC8, VI4, VI3, VT0 + vfmaxa.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L26 +#endif + .align 3 + +.L20: // INCX!=1 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -3 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + addi.w i0, i0, -7 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vadd.d VI1, VI1, VINC4 + vfmul.d x3, VI4, x1 + vfmul.d x4, VI4, x2 + vfcmp.clt.d VT0, x1, VI3 + vfcmp.clt.d VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + vfadd.d x1, x1, x2 + vfmax.d x3, VM0, x1 + ld.d t1, X, 0 * SIZE + vfcmp.ceq.d VT0, x3, VM0 + ld.d t2, X, 1 * SIZE + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vadd.d VI1, VI1, VINC4 + addi.d I, I, -1 + vfmul.d x3, VI4, x1 + vfmul.d x4, VI4, x2 + vfcmp.clt.d VT0, x1, VI3 + vfcmp.clt.d VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + vfadd.d x1, x1, x2 + vfmax.d x3, VM0, x1 + vfcmp.ceq.d VT0, x3, VM0 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + vadd.w VI1, VI1, VINC4 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + addi.d I, I, -1 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, VI4, x1 + vfmul.s x4, VI4, x2 + vfcmp.clt.s VT0, x1, VI3 + vfcmp.clt.s VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + vfadd.s x1, x1, x2 + vfmax.s x3, VM0, x1 + vfcmp.ceq.s 
VT0, x3, VM0 +#endif + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmaxa.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC4, VI2, VI1, VT0 + vfmaxa.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC8, VI4, VI3, VT0 + vfmaxa.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 +#endif + .align 3 + +#ifdef DOUBLE +.L26: + vfmaxa.d VM0, x1, x2 + vfcmp.ceq.d VT0, x1, VM0 + vbitsel.v VI0, VI2, VI1, VT0 + .align 3 + +.L27: + movfr2gr.d i0, $f20 + .align 3 +#else +.L26: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + vfcmp.clt.s VT0, VI2, VI0 + vbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + vfcmp.clt.s VT0, VI3, VI0 + vbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + vfcmp.clt.s VT0, VI4, VI0 + vbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: + movfr2gr.s i0, $f20 + .align 3 + +#endif +.L21: //N<4 + andi I, N, 3 + bge $r0, I, .L999 + srai.d i1, N, 2 + slli.d i1, i1, 2 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + addi.d I, I, -1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 + FMAX a1, s1, a0 + CMPEQ $fcc0, s1, a1 + add.d X, X, INCX + fsel s1, a1, s1, $fcc0 + fsel $f20, $f21, $f20, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/idamax_lasx.S b/kernel/loongarch64/idamax_lasx.S deleted file mode 100644 index 8248ee757..000000000 --- a/kernel/loongarch64/idamax_lasx.S +++ /dev/null @@ -1,275 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define t1 $r13 -#define t2 $r15 -#define t3 $r18 -#define t4 $r16 -#define i0 $r17 -#define i1 $r14 -#define TEMP $r19 -#define x1 $xr9 -#define x2 $xr10 -#define x3 $xr11 -#define x4 $xr12 -#define VX0 $xr13 -#define VX1 $xr14 -#define VM0 $xr15 -#define VM1 $xr16 -#define VINC4 $xr17 -#define VINC8 $xr18 -#define VI0 $xr20 -#define VI1 $xr21 -#define VI2 $xr22 -#define VI3 $xr8 -#define VI4 $xr19 -#define VT0 $xr23 - - PROLOGUE - li.d i0, 0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - xvld VM0, X, 0 - addi.d i0, i0, 1 - srai.d I, N, 3 - bge $r0, I, .L21 - slli.d i0, i0, 2 //4 - xvreplgr2vr.d VINC4, i0 - slli.d i0, i0, 1 //8 - xvreplgr2vr.d VINC8, i0 - addi.d i0, i0, -15 - xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 3 - addi.d i0, i0, 5 - xvinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 1 //2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 2 
//3 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 3 //4 - .align 3 - -.L10: - xvld VX0, X, 0 * SIZE - xvadd.d VI1, VI1, VINC8 - xvld VX1, X, 4 * SIZE - xvadd.d VI2, VI1, VINC4 - xvfmaxa.d VM1, VX0, VX1 - xvfcmp.ceq.d VT0, VX0, VM1 - addi.d I, I, -1 - xvbitsel.v VI2, VI2, VI1, VT0 - xvfmaxa.d VM1, VM0, VM1 - xvfcmp.ceq.d VT0, VM0, VM1 - addi.d X, X, 8 * SIZE - xvbitsel.v VM0, VM1, VM0, VT0 - xvbitsel.v VI0, VI2, VI0, VT0 - blt $r0, I, .L10 - .align 3 - -.L15: - xvpickve.d VI1, VI0, 0 - xvpickve.d VI2, VI0, 1 - xvpickve.d VI3, VI0, 2 - xvpickve.d VI4, VI0, 3 - xvpickve.d x1, VM0, 0 - xvpickve.d x2, VM0, 1 - xvpickve.d x3, VM0, 2 - xvpickve.d x4, VM0, 3 - xvfmaxa.d VM1, x1, x2 - xvfcmp.ceq.d VT0, x1, VM1 - xvbitsel.v VINC4, VI2, VI1, VT0 - xvfmaxa.d VM0, x4, x3 - xvfcmp.ceq.d VT0, x3, VM0 - xvbitsel.v VINC8, VI4, VI3, VT0 - xvfmaxa.d VM0, VM0, VM1 - xvfcmp.ceq.d VT0, VM0, VM1 - xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - xvfcmp.ceq.d VT0, VM0, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - xvfcmp.clt.d VT0, VI1, VI0 - xvbitsel.v VI0, VI0, VI1, VT0 - b .L26 - .align 3 - -.L20: // INCX!=1 - move TEMP, X - addi.d i0, i0, 1 - ld.d t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - xvinsgr2vr.d VM0, t1, 0 - srai.d I, N, 3 - bge $r0, I, .L21 - ld.d t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.d t3, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.d t4, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - xvinsgr2vr.d VM0, t2, 1 - xvinsgr2vr.d VM0, t3, 2 - xvinsgr2vr.d VM0, t4, 3 - slli.d i0, i0, 2 //4 - xvreplgr2vr.d VINC4, i0 - slli.d i0, i0, 1 //8 - xvreplgr2vr.d VINC8, i0 - addi.d i0, i0, -15 - xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 3 - addi.d i0, i0, 5 - xvinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 1 //2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 2 //3 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 3 //4 - .align 3 - -.L24: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 - xvinsgr2vr.d VX0, t3, 2 - xvinsgr2vr.d VX0, t4, 3 - xvadd.d VI1, VI1, VINC8 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX1, t1, 0 - xvinsgr2vr.d VX1, t2, 1 - xvinsgr2vr.d VX1, t3, 2 - xvinsgr2vr.d VX1, t4, 3 - xvadd.d VI2, VI1, VINC4 - xvfmaxa.d VM1, VX0, VX1 - xvfcmp.ceq.d VT0, VX0, VM1 - addi.d I, I, -1 - xvbitsel.v VI2, VI2, VI1, VT0 - xvfmaxa.d VM1, VM0, VM1 - xvfcmp.ceq.d VT0, VM0, VM1 - xvbitsel.v VM0, VM1, VM0, VT0 - xvbitsel.v VI0, VI2, VI0, VT0 - blt $r0, I, .L24 - .align 3 - -.L25: - xvpickve.d VI1, VI0, 0 - xvpickve.d VI2, VI0, 1 - xvpickve.d VI3, VI0, 2 - xvpickve.d VI4, VI0, 3 - xvpickve.d x1, VM0, 0 - xvpickve.d x2, VM0, 1 - xvpickve.d x3, VM0, 2 - xvpickve.d x4, VM0, 3 - xvfmaxa.d VM1, x1, x2 - xvfcmp.ceq.d VT0, x1, VM1 - xvbitsel.v VINC4, VI2, VI1, VT0 - xvfmaxa.d VM0, x4, x3 - xvfcmp.ceq.d VT0, x3, VM0 - xvbitsel.v VINC8, VI4, VI3, VT0 - xvfmaxa.d VM0, VM0, VM1 - xvfcmp.ceq.d VT0, VM0, VM1 - xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - xvfcmp.ceq.d VT0, VM0, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - 
bceqz $fcc0, .L26 - xvfcmp.clt.d VT0, VI1, VI0 - xvbitsel.v VI0, VI0, VI1, VT0 - .align 3 - -.L26: - xvfcmp.ceq.d VT0, VM0, x2 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L27 - xvfcmp.clt.d VT0, VI2, VI0 - xvbitsel.v VI0, VI0, VI2, VT0 - .align 3 - -.L27: - xvfcmp.ceq.d VT0, VM0, x3 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L28 - xvfcmp.clt.d VT0, VI3, VI0 - xvbitsel.v VI0, VI0, VI3, VT0 - .align 3 - -.L28: - xvfcmp.ceq.d VT0, VM0, x4 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L29 - xvfcmp.clt.d VT0, VI4, VI0 - xvbitsel.v VI0, VI0, VI4, VT0 - .align 3 - -.L29: - movfr2gr.d i0, $f20 - .align 3 - -.L21: //N<8 - andi I, N, 7 - bge $r0, I, .L999 - srai.d i1, N, 3 - slli.d i1, i1, 3 - addi.d i1, i1, 1 //current index - movgr2fr.d $f21, i1 - movgr2fr.d $f20, i0 - .align 3 - -.L22: - fld.d $f9, X, 0 - addi.d I, I, -1 - xvfmaxa.d VM1, x1, VM0 - xvfcmp.ceq.d VT0, VM0, VM1 - add.d X, X, INCX - xvbitsel.v VM0, VM1, VM0, VT0 - xvbitsel.v VI0, VI1, VI0, VT0 - addi.d i1, i1, 1 - movgr2fr.d $f21, i1 - blt $r0, I, .L22 - movfr2gr.d i0, $f20 - .align 3 - -.L999: - move $r4, $r17 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/idamax_lsx.S b/kernel/loongarch64/idamax_lsx.S deleted file mode 100644 index fb2d5bac1..000000000 --- a/kernel/loongarch64/idamax_lsx.S +++ /dev/null @@ -1,267 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define t1 $r13 -#define t2 $r15 -#define t3 $r18 -#define t4 $r16 -#define i0 $r17 -#define i1 $r14 -#define TEMP $r19 -#define x1 $vr9 -#define x2 $vr10 -#define x3 $vr11 -#define x4 $vr12 -#define VX0 $vr13 -#define VX1 $vr14 -#define VM0 $vr15 -#define VM1 $vr16 -#define VINC2 $vr17 -#define VINC4 $vr18 -#define VI0 $vr20 -#define VI1 $vr21 -#define VI2 $vr22 -#define VI3 $vr8 -#define VI4 $vr19 -#define VT0 $vr23 - - PROLOGUE - li.d i0, 0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - vld VM0, X, 0 - addi.d i0, i0, 1 - srai.d I, N, 3 - bge $r0, I, .L11 - slli.d i0, i0, 1 //2 - vreplgr2vr.d VINC2, i0 - slli.d i0, i0, 1 //4 - vreplgr2vr.d VINC4, i0 - addi.d i0, i0, -7 - vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - vinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 3 - vinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - vinsgr2vr.d VI0, i0, 1 //2 - .align 3 - -.L10: - vld VX0, X, 0 * SIZE - vadd.d VI1, VI1, VINC4 - vld VX1, X, 2 * SIZE - vadd.d VI2, VI1, VINC2 - vfmaxa.d x1, VX0, VX1 - vfcmp.ceq.d VT0, VX0, x1 - vbitsel.v x2, VI2, VI1, VT0 - vld VX0, X, 4 * SIZE - vadd.d VI1, VI2, VINC2 - vld VX1, X, 6 * SIZE - vadd.d VI2, VI1, VINC2 - vfmaxa.d x3, VX0, VX1 - vfcmp.ceq.d VT0, VX0, x3 - vbitsel.v x4, VI2, VI1, VT0 - vfmaxa.d x3, x1, x3 - vfcmp.ceq.d VT0, x1, x3 - vbitsel.v x2, x4, x2, VT0 - vfmaxa.d VM1, VM0, x3 - vfcmp.ceq.d VT0, VM0, VM1 - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, x2, VI0, VT0 - addi.d I, I, -1 - addi.d X, X, 8 * SIZE - blt $r0, I, .L10 - .align 3 - -.L15: - vreplvei.d VI1, VI0, 0 - vreplvei.d VI2, VI0, 1 - vreplvei.d x1, VM0, 0 - vreplvei.d x2, VM0, 1 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - vfcmp.ceq.d VT0, x2, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L16 - vfcmp.clt.d VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - b .L17 - .align 3 - -.L16: - vfmaxa.d VM0, x1, x2 - vfcmp.ceq.d VT0, x1, VM0 - vbitsel.v VI0, VI2, VI1, VT0 - .align 3 - -.L17: - movfr2gr.d i0, $f20 - .align 
3 - -.L11: //INCX==1 and N<8 - andi I, N, 7 - bge $r0, I, .L14 - srai.d i1, N, 3 - slli.d i1, i1, 3 - addi.d i1, i1, 1 //current index - movgr2fr.d $f21, i1 - movgr2fr.d $f20, i0 - .align 3 - -.L13: - fld.d $f9, X, 0 - vfmaxa.d VM1, x1, VM0 - vfcmp.ceq.d VT0, VM0, VM1 - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, VI1, VI0, VT0 - addi.d I, I, -1 - addi.d i1, i1, 1 - addi.d X, X, SIZE - movgr2fr.d $f21, i1 - blt $r0, I, .L13 - movfr2gr.d i0, $f20 - .align 3 - -.L14: - move $r4, $r17 - jirl $r0, $r1, 0x0 - .align 3 - -.L20: // INCX!=1 - move TEMP, X - addi.d i0, i0, 1 - ld.d t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.d VM0, t1, 0 - srai.d I, N, 3 - bge $r0, I, .L21 - ld.d t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.d VM0, t2, 1 - slli.d i0, i0, 1 //2 - vreplgr2vr.d VINC2, i0 - slli.d i0, i0, 1 //4 - vreplgr2vr.d VINC4, i0 - addi.d i0, i0, -7 - vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - vinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 3 - vinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - vinsgr2vr.d VI0, i0, 1 //2 - .align 3 - -.L24: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t2, 1 - vadd.d VI1, VI1, VINC4 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t1, 0 - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t2, 1 - vadd.d VI2, VI1, VINC2 - vfmaxa.d x1, VX0, VX1 - vfcmp.ceq.d VT0, VX0, x1 - vbitsel.v x2, VI2, VI1, VT0 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t2, 1 - vadd.d VI1, VI2, VINC2 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t1, 0 - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t2, 1 - vadd.d VI2, VI1, VINC2 - vfmaxa.d x3, VX0, VX1 - vfcmp.ceq.d VT0, VX0, x3 - vbitsel.v x4, VI2, VI1, VT0 - vfmaxa.d x3, x1, x3 - vfcmp.ceq.d VT0, x1, x3 - vbitsel.v x2, x4, x2, VT0 - vfmaxa.d VM1, VM0, x3 - vbitsel.v VM0, VM1, VM0, VT0 - vfcmp.ceq.d VT0, VM0, VM1 - vbitsel.v VI0, x2, VI0, VT0 - addi.d I, I, -1 - blt $r0, I, .L24 - .align 3 - -.L25: - vreplvei.d VI1, VI0, 0 - vreplvei.d VI2, VI0, 1 - vreplvei.d x1, VM0, 0 - vreplvei.d x2, VM0, 1 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - vfcmp.ceq.d VT0, x2, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.d VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - b .L27 - .align 3 - -.L26: - vfmaxa.d VM0, x1, x2 - vfcmp.ceq.d VT0, x1, VM0 - vbitsel.v VI0, VI2, VI1, VT0 - .align 3 - -.L27: - movfr2gr.d i0, $f20 - .align 3 - -.L21: // N<8 - andi I, N, 7 - bge $r0, I, .L999 - srai.d i1, N, 3 - slli.d i1, i1, 3 - addi.d i1, i1, 1 //current index - movgr2fr.d $f21, i1 - movgr2fr.d $f20, i0 - .align 3 - -.L22: - fld.d $f9, X, 0 - vfmaxa.d VM1, x1, VM0 - vfcmp.ceq.d VT0, VM0, VM1 - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, VI1, VI0, VT0 - addi.d I, I, -1 - addi.d i1, i1, 1 - add.d X, X, INCX - movgr2fr.d $f21, i1 - blt $r0, I, .L22 - movfr2gr.d i0, $f20 - .align 3 - -.L999: - move $r4, $r17 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/isamax_lsx.S b/kernel/loongarch64/isamax_lsx.S deleted file mode 100644 index a18aa7354..000000000 --- a/kernel/loongarch64/isamax_lsx.S +++ /dev/null @@ -1,275 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define t1 $r13 -#define t2 $r15 -#define t3 $r18 -#define t4 $r16 -#define i0 $r17 -#define i1 
$r14 -#define TEMP $r19 -#define x1 $vr9 -#define x2 $vr10 -#define x3 $vr11 -#define x4 $vr12 -#define VX0 $vr13 -#define VX1 $vr14 -#define VM0 $vr15 -#define VM1 $vr16 -#define VINC4 $vr17 -#define VINC8 $vr18 -#define VI0 $vr20 -#define VI1 $vr21 -#define VI2 $vr22 -#define VI3 $vr8 -#define VI4 $vr19 -#define VT0 $vr23 - - PROLOGUE - li.d i0, 0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - vld VM0, X, 0 - addi.w i0, i0, 1 - srai.d I, N, 3 - bge $r0, I, .L21 - slli.w i0, i0, 2 //4 - vreplgr2vr.w VINC4, i0 - slli.w i0, i0, 1 //8 - vreplgr2vr.w VINC8, i0 - addi.w i0, i0, -15 - vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 1 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 2 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 3 - addi.w i0, i0, 5 - vinsgr2vr.w VI0, i0, 0 //1 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 1 //2 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 2 //3 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 3 //4 - .align 3 - -.L10: - vld VX0, X, 0 * SIZE - vadd.w VI1, VI1, VINC8 - vld VX1, X, 4 * SIZE - vadd.w VI2, VI1, VINC4 - vfmaxa.s VM1, VX0, VX1 - vfcmp.ceq.s VT0, VX0, VM1 - addi.d I, I, -1 - vbitsel.v VI2, VI2, VI1, VT0 - vfmaxa.s VM1, VM0, VM1 - vfcmp.ceq.s VT0, VM0, VM1 - addi.d X, X, 8 * SIZE - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, VI2, VI0, VT0 - blt $r0, I, .L10 - .align 3 - -.L15: - vreplvei.w VI1, VI0, 0 - vreplvei.w VI2, VI0, 1 - vreplvei.w VI3, VI0, 2 - vreplvei.w VI4, VI0, 3 - vreplvei.w x1, VM0, 0 - vreplvei.w x2, VM0, 1 - vreplvei.w x3, VM0, 2 - vreplvei.w x4, VM0, 3 - vfmaxa.s VM1, x1, x2 - vfcmp.ceq.s VT0, VM1, x1 - vbitsel.v VINC4, VI2, VI1, VT0 - vfmaxa.s VM0, x3, x4 - vfcmp.ceq.s VT0, x3, VM0 - vbitsel.v VINC8, VI4, VI3, VT0 - vfmaxa.s VM0, VM0, VM1 - vfcmp.ceq.s VT0, VM0, VM1 - vbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - vfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.s VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - b .L26 - .align 3 - -.L20: // INCX!=1 - move TEMP, X - addi.w i0, i0, 1 - ld.w t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.w VM0, t1, 0 - srai.d I, N, 3 - bge $r0, I, .L21 - ld.w t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.w t3, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.w t4, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.w VM0, t2, 1 - vinsgr2vr.w VM0, t3, 2 - vinsgr2vr.w VM0, t4, 3 - slli.w i0, i0, 2 //4 - vreplgr2vr.w VINC4, i0 - slli.w i0, i0, 1 //8 - vreplgr2vr.w VINC8, i0 - addi.w i0, i0, -15 - vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 1 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 2 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 3 - addi.w i0, i0, 5 - vinsgr2vr.w VI0, i0, 0 //1 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 1 //2 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 2 //3 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 3 //4 - .align 3 - -.L24: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - vadd.w VI1, VI1, VINC8 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - 
vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vadd.w VI2, VI1, VINC4 - vfmaxa.s VM1, VX0, VX1 - vfcmp.ceq.s VT0, VX0, VM1 - vbitsel.v VI2, VI2, VI1, VT0 - vfmaxa.s VM1, VM0, VM1 - vfcmp.ceq.s VT0, VM0, VM1 - addi.d I, I, -1 - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, VI2, VI0, VT0 - blt $r0, I, .L24 - .align 3 - -.L25: - vreplvei.w VI1, VI0, 0 - vreplvei.w VI2, VI0, 1 - vreplvei.w VI3, VI0, 2 - vreplvei.w VI4, VI0, 3 - vreplvei.w x1, VM0, 0 - vreplvei.w x2, VM0, 1 - vreplvei.w x3, VM0, 2 - vreplvei.w x4, VM0, 3 - vfmaxa.s VM1, x1, x2 - vfcmp.ceq.s VT0, VM1, x1 - vbitsel.v VINC4, VI2, VI1, VT0 - vfmaxa.s VM0, x3, x4 - vfcmp.ceq.s VT0, x3, VM0 - vbitsel.v VINC8, VI4, VI3, VT0 - vfmaxa.s VM0, VM0, VM1 - vfcmp.ceq.s VT0, VM0, VM1 - vbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - vfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.s VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - .align 3 - -.L26: - vfcmp.ceq.s VT0, VM0, x2 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L27 - vfcmp.clt.s VT0, VI2, VI0 - vbitsel.v VI0, VI0, VI2, VT0 - .align 3 - -.L27: - vfcmp.ceq.s VT0, VM0, x3 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L28 - vfcmp.clt.s VT0, VI3, VI0 - vbitsel.v VI0, VI0, VI3, VT0 - .align 3 - -.L28: - vfcmp.ceq.s VT0, VM0, x4 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L29 - vfcmp.clt.s VT0, VI4, VI0 - vbitsel.v VI0, VI0, VI4, VT0 - .align 3 - -.L29: - movfr2gr.s i0, $f20 - .align 3 - -.L21: //N<8 - andi I, N, 7 - bge $r0, I, .L999 - srai.d i1, N, 3 - slli.d i1, i1, 3 - addi.d i1, i1, 1 //current index - movgr2fr.d $f21, i1 - movgr2fr.d $f20, i0 - .align 3 - -.L22: - fld.s $f9, X, 0 - addi.d I, I, -1 - vfmaxa.s VM1, x1, VM0 - vfcmp.ceq.s VT0, VM0, VM1 - add.d X, X, INCX - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, VI1, VI0, VT0 - addi.d i1, i1, 1 - movgr2fr.d $f21, i1 - blt $r0, I, .L22 - movfr2gr.s i0, $f20 - .align 3 - -.L999: - move $r4, $r17 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE From 179ed51d3b2df5e0df8a28d184bd169efa7f2b61 Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Thu, 21 Dec 2023 14:18:39 +0800 Subject: [PATCH 08/21] Add dgemm_kernel_8x4.S file. 
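Editor's note: for context, a hedged C reference of the block update an 8x4 DGEMM micro-kernel performs. The packing layout is assumed from the gemm_ncopy_8 / gemm_ncopy_4 copy kernels listed in the diffstat below; the names are illustrative, not the OpenBLAS API.

static void dgemm_8x4_ref(long k, double alpha,
                          const double *a,     /* packed 8 x k panel: 8 A values per k step */
                          const double *b,     /* packed k x 4 panel: 4 B values per k step */
                          double *c, long ldc) /* column-major 8x4 block of C */
{
    double acc[8][4] = {{0.0}};
    for (long p = 0; p < k; p++)
        for (int j = 0; j < 4; j++)
            for (int i = 0; i < 8; i++)
                acc[i][j] += a[p * 8 + i] * b[p * 4 + j];
    for (int j = 0; j < 4; j++)
        for (int i = 0; i < 8; i++)
            c[j * ldc + i] += alpha * acc[i][j];
}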
--- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 14 + kernel/loongarch64/dgemm_kernel_8x4.S | 2894 ++++++++++++++++++++++ param.h | 4 +- 3 files changed, 2910 insertions(+), 2 deletions(-) create mode 100644 kernel/loongarch64/dgemm_kernel_8x4.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index a8a6dd82f..a78c0dbc5 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -58,4 +58,18 @@ DROTKERNEL = rot_lsx.S SNRM2KERNEL = snrm2_lsx.S DNRM2KERNEL = dnrm2_lsx.S +DGEMMKERNEL = dgemm_kernel_8x4.S +DGEMMINCOPY = ../generic/gemm_ncopy_8.c +DGEMMITCOPY = ../generic/gemm_tcopy_8.c +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif diff --git a/kernel/loongarch64/dgemm_kernel_8x4.S b/kernel/loongarch64/dgemm_kernel_8x4.S new file mode 100644 index 000000000..405f1bd97 --- /dev/null +++ b/kernel/loongarch64/dgemm_kernel_8x4.S @@ -0,0 +1,2894 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define ALPHA $f0 // param 4: alpha +#define A $r7 // param 5: ba +#define B $r8 // param 6: bb +#define C $r9 // param 7: bc +#define LDC $r10 // param 8: ldc + +#ifdef TRMMKERNEL +#define OFFSET $r11 // param 9: offset +#endif +#define OFF $r12 + +/* Cycle control parameters */ +#define I $r13 +#define J $r14 +#define L $r15 +#define TL $r16 +/* Matrix address */ +#define A0 $r17 +#define B0 $r18 +#define C0 $r19 +#define C1 $r20 +#define C2 $r23 +#define C3 $r24 +#define T0 $r25 /* !! DO NOT USE $r21 and $r22 !! */ +#define T1 $r26 +#define T2 $r27 +#define ZERO $r0 + +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define U10 $vr10 +#define U11 $vr11 +#define U12 $vr12 +#define U13 $vr13 +#define U14 $vr14 +#define U15 $vr15 +#define D0 $vr16 +#define D1 $vr17 +#define D2 $vr18 +#define D3 $vr19 +#define D4 $vr20 +#define D5 $vr21 +#define D6 $vr22 +#define D7 $vr23 +#define D8 $vr24 +#define D9 $vr25 +#define D10 $vr26 +#define D11 $vr27 +#define D12 $vr28 +#define D13 $vr29 +#define D14 $vr30 +#define D15 $vr31 +#define VALPHA $vr15 + +/* Prefetch interval */ +#define A_PRE 0x200 +#define B_PRE 0x100 + +.macro KERNEL2x8x4 + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vld U1, A0, 0x10 + vfmadd.d D2, U10, U12, D2 + vfmadd.d D3, U11, U12, D3 + + vld U2, A0, 0x20 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vld U3, A0, 0x30 + vfmadd.d D6, U10, U13, D6 + vfmadd.d D7, U11, U13, D7 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D8, U8, U14, D8 + vfmadd.d D9, U9, U14, D9 + + preld 0, B0, B_PRE + vldrepl.d U5, B0, 0x08 + vfmadd.d D10, U10, U14, D10 + vfmadd.d D11, U11, U14, D11 + + preld 0, A0, A_PRE + vldrepl.d U6, B0, 0x10 + vfmadd.d D12, U8, U15, D12 + vfmadd.d D13, U9, U15, D13 + + preld 0, A0, A_PRE + 0x40 + vldrepl.d U7, B0, 0x18 + vfmadd.d D14, U10, U15, D14 + vfmadd.d D15, U11, U15, D15 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + vld U8, A0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vld U9, A0, 0x10 + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + vld U10, A0, 0x20 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + vld U11, A0, 0x30 + vfmadd.d D6, U2, U5, D6 + vfmadd.d D7, U3, U5, D7 + + vldrepl.d U12, B0, 0x00 + vfmadd.d D8, U0, U6, D8 + vfmadd.d D9, U1, U6, D9 + + preld 0, B0, B_PRE + vldrepl.d U13, B0, 0x08 + vfmadd.d D10, U2, U6, D10 + vfmadd.d D11, U3, U6, D11 + + preld 0, A0, A_PRE + vldrepl.d U14, B0, 0x10 + vfmadd.d D12, U0, U7, D12 + vfmadd.d D13, U1, U7, D13 + + preld 0, A0, A_PRE + 0x40 + vldrepl.d U15, B0, 0x18 + vfmadd.d D14, U2, U7, D14 + vfmadd.d D15, U3, U7, D15 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x8x4_END + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vld U1, A0, 0x10 + vfmadd.d D2, U10, U12, D2 + vfmadd.d D3, U11, U12, D3 + + vld U2, A0, 0x20 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vld U3, A0, 0x30 + vfmadd.d D6, U10, U13, D6 + vfmadd.d D7, U11, U13, D7 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D8, U8, U14, D8 + vfmadd.d D9, U9, U14, D9 + + preld 0, B0, B_PRE + vldrepl.d U5, B0, 0x08 + vfmadd.d D10, U10, U14, D10 + vfmadd.d D11, U11, U14, 
D11 + + preld 0, A0, A_PRE + vldrepl.d U6, B0, 0x10 + vfmadd.d D12, U8, U15, D12 + vfmadd.d D13, U9, U15, D13 + + preld 0, A0, A_PRE + 0x40 + vldrepl.d U7, B0, 0x18 + vfmadd.d D14, U10, U15, D14 + vfmadd.d D15, U11, U15, D15 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + vfmadd.d D6, U2, U5, D6 + vfmadd.d D7, U3, U5, D7 + + vfmadd.d D8, U0, U6, D8 + vfmadd.d D9, U1, U6, D9 + + preld 0, B0, B_PRE + vfmadd.d D10, U2, U6, D10 + vfmadd.d D11, U3, U6, D11 + + preld 0, A0, A_PRE + vfmadd.d D12, U0, U7, D12 + vfmadd.d D13, U1, U7, D13 + + preld 0, A0, A_PRE + 0x40 + vfmadd.d D14, U2, U7, D14 + vfmadd.d D15, U3, U7, D15 +.endm + +.macro KERNEL8x8x4 +.rept 4 + KERNEL2x8x4 +.endr +.endm + +.macro KERNEL8x8x4_END +.rept 3 + KERNEL2x8x4 +.endr + KERNEL2x8x4_END +.endm + +.macro KERNEL2x4x4 + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vldrepl.d U5, B0, 0x08 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vldrepl.d U6, B0, 0x10 + vfmadd.d D8, U8, U14, D8 + vfmadd.d D9, U9, U14, D9 + + vldrepl.d U7, B0, 0x18 + vfmadd.d D12, U8, U15, D12 + vfmadd.d D13, U9, U15, D13 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + + vldrepl.d U12, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vldrepl.d U13, B0, 0x08 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + vldrepl.d U14, B0, 0x10 + vfmadd.d D8, U0, U6, D8 + vfmadd.d D9, U1, U6, D9 + + vldrepl.d U15, B0, 0x18 + vfmadd.d D12, U0, U7, D12 + vfmadd.d D13, U1, U7, D13 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x4x4_END + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vldrepl.d U5, B0, 0x08 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vldrepl.d U6, B0, 0x10 + vfmadd.d D8, U8, U14, D8 + vfmadd.d D9, U9, U14, D9 + + vldrepl.d U7, B0, 0x18 + vfmadd.d D12, U8, U15, D12 + vfmadd.d D13, U9, U15, D13 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + vfmadd.d D8, U0, U6, D8 + vfmadd.d D9, U1, U6, D9 + vfmadd.d D12, U0, U7, D12 + vfmadd.d D13, U1, U7, D13 +.endm + +.macro KERNEL8x4x4 +.rept 4 + KERNEL2x4x4 +.endr +.endm + +.macro KERNEL8x4x4_END +.rept 3 + KERNEL2x4x4 +.endr + KERNEL2x4x4_END +.endm + +.macro KERNEL2x2x4 + vldrepl.d U0, A0, 0x00 + vldrepl.d U1, A0, 0x08 + + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U8, U13, D1 + vfmadd.d D2, U9, U12, D2 + vfmadd.d D3, U9, U13, D3 + + vld U4, B0, 0x00 + vld U5, B0, 0x10 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + vldrepl.d U8, A0, 0x00 + vldrepl.d U9, A0, 0x08 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U0, U5, D1 + vfmadd.d D2, U1, U4, D2 + vfmadd.d D3, U1, U5, D3 + + vld U12, B0, 0x00 + vld U13, B0, 0x10 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x2x4_END + vldrepl.d U0, A0, 0x00 + vldrepl.d U1, A0, 0x08 + + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U8, U13, D1 + vfmadd.d D2, U9, U12, D2 + vfmadd.d D3, U9, U13, D3 + + vld U4, B0, 0x00 + vld U5, B0, 0x10 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U0, U5, D1 + vfmadd.d D2, U1, U4, D2 + vfmadd.d D3, U1, U5, D3 +.endm + +.macro KERNEL8x2x4 +.rept 4 + KERNEL2x2x4 +.endr +.endm + +.macro 
KERNEL8x2x4_END +.rept 3 + KERNEL2x2x4 +.endr + KERNEL2x2x4_END +.endm + +.macro KERNEL2x1x4 + vldrepl.d U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U8, U13, D1 + vld U4, B0, 0x00 + vld U5, B0, 0x10 + + vldrepl.d U8, A0, 0x08 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U0, U5, D1 + vld U12, B0, 0x20 + vld U13, B0, 0x30 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x40 +.endm + +.macro KERNEL2x1x4_END + vldrepl.d U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U8, U13, D1 + vld U4, B0, 0x00 + vld U5, B0, 0x10 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U0, U5, D1 +.endm + +.macro KERNEL8x1x4 +.rept 4 + KERNEL2x1x4 +.endr +.endm + +.macro KERNEL8x1x4_END +.rept 3 + KERNEL2x1x4 +.endr + KERNEL2x1x4_END +.endm + +.macro KERNEL2x8x2 + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vld U1, A0, 0x10 + vfmadd.d D2, U10, U12, D2 + vfmadd.d D3, U11, U12, D3 + + vld U2, A0, 0x20 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vld U3, A0, 0x30 + vfmadd.d D6, U10, U13, D6 + vfmadd.d D7, U11, U13, D7 + + vldrepl.d U4, B0, 0x00 + vldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + vld U8, A0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vld U9, A0, 0x10 + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + vld U10, A0, 0x20 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + vld U11, A0, 0x30 + vfmadd.d D6, U2, U5, D6 + vfmadd.d D7, U3, U5, D7 + + vldrepl.d U12, B0, 0x00 + vldrepl.d U13, B0, 0x08 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 +.endm + +.macro KERNEL2x8x2_END + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vld U1, A0, 0x10 + vfmadd.d D2, U10, U12, D2 + vfmadd.d D3, U11, U12, D3 + + vld U2, A0, 0x20 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vld U3, A0, 0x30 + vfmadd.d D6, U10, U13, D6 + vfmadd.d D7, U11, U13, D7 + + vldrepl.d U4, B0, 0x00 + vldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + vfmadd.d D6, U2, U5, D6 + vfmadd.d D7, U3, U5, D7 +.endm + +.macro KERNEL8x8x2 +.rept 4 + KERNEL2x8x2 +.endr +.endm + +.macro KERNEL8x8x2_END +.rept 3 + KERNEL2x8x2 +.endr + KERNEL2x8x2_END +.endm + +.macro KERNEL2x4x2 + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vldrepl.d U4, B0, 0x00 + vldrepl.d U5, B0, 0x08 + + vld U8, A0, 0x20 + vld U9, A0, 0x30 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + vldrepl.d U12, B0, 0x10 + vldrepl.d U13, B0, 0x18 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x4x2_END + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vldrepl.d U4, B0, 0x00 + vldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 +.endm + +.macro KERNEL8x4x2 +.rept 4 + KERNEL2x4x2 +.endr +.endm + +.macro KERNEL8x4x2_END +.rept 3 + KERNEL2x4x2 +.endr + KERNEL2x4x2_END +.endm + +.macro KERNEL2x2x2 + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D4, U8, U13, D4 + + vldrepl.d U4, B0, 0x00 + vldrepl.d U5, B0, 0x08 + + vld U8, A0, 0x10 + 
vldrepl.d U12, B0, 0x10 + vldrepl.d U13, B0, 0x18 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D4, U0, U5, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x2x2_END + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D4, U8, U13, D4 + + vldrepl.d U4, B0, 0x00 + vldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D4, U0, U5, D4 +.endm + +.macro KERNEL8x2x2 +.rept 4 + KERNEL2x2x2 +.endr +.endm + +.macro KERNEL8x2x2_END +.rept 3 + KERNEL2x2x2 +.endr + KERNEL2x2x2_END +.endm + +.macro KERNEL2x1x2 + vldrepl.d U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + + vld U4, B0, 0x00 + vldrepl.d U8, A0, 0x08 + vld U12, B0, 0x10 + vfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x1x2_END + vldrepl.d U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + + vld U4, B0, 0x00 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + vfmadd.d D0, U0, U4, D0 +.endm + +.macro KERNEL8x1x2 +.rept 4 + KERNEL2x1x2 +.endr +.endm + +.macro KERNEL8x1x2_END +.rept 3 + KERNEL2x1x2 +.endr + KERNEL2x1x2_END +.endm + +.macro KERNEL2x8x1 + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vld U1, A0, 0x10 + vfmadd.d D2, U10, U12, D2 + vfmadd.d D3, U11, U12, D3 + + vldrepl.d U4, B0, 0x00 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vld U8, A0, 0x40 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vld U9, A0, 0x50 + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + vld U10, A0, 0x60 + vld U11, A0, 0x70 + + vldrepl.d U12, B0, 0x08 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 +.endm + +.macro KERNEL2x8x1_END + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vld U1, A0, 0x10 + vfmadd.d D2, U10, U12, D2 + vfmadd.d D3, U11, U12, D3 + + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 +.endm + +.macro KERNEL8x8x1 +.rept 4 + KERNEL2x8x1 +.endr +.endm + +.macro KERNEL8x8x1_END +.rept 3 + KERNEL2x8x1 +.endr + KERNEL2x8x1_END +.endm + +.macro KERNEL2x4x1 + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + vldrepl.d U4, B0, 0x00 + + vld U8, A0, 0x20 + vld U9, A0, 0x30 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + vldrepl.d U12, B0, 0x08 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 +.endm + +.macro KERNEL2x4x1_END + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + vldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 +.endm + +.macro KERNEL8x4x1 +.rept 4 + KERNEL2x4x1 +.endr +.endm + +.macro KERNEL8x4x1_END +.rept 3 + KERNEL2x4x1 +.endr + KERNEL2x4x1_END +.endm + +.macro KERNEL2x2x1 + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + vld U8, A0, 0x00 + vfmadd.d D0, U0, U4, D0 + vldrepl.d U12, B0, 0x00 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 +.endm + +.macro KERNEL2x2x1_END + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + vfmadd.d D0, U0, U4, D0 +.endm + +.macro KERNEL8x2x1 +.rept 4 + KERNEL2x2x1 +.endr +.endm + +.macro KERNEL8x2x1_END +.rept 3 + KERNEL2x2x1 +.endr + KERNEL2x2x1_END +.endm + +.macro KERNEL2x1x1 + vldrepl.d U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x08 + 
addi.d B0, B0, 0x08 + + vldrepl.d U8, A0, 0x00 + vfmadd.d D0, U0, U4, D0 + vldrepl.d U12, B0, 0x00 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 +.endm + +.macro KERNEL2x1x1_END + vldrepl.d U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + vfmadd.d D0, U0, U4, D0 +.endm + +.macro KERNEL8x1x1 +.rept 4 + KERNEL2x1x1 +.endr +.endm + +.macro KERNEL8x1x1_END +.rept 3 + KERNEL2x1x1 +.endr + KERNEL2x1x1_END +.endm + + + PROLOGUE + + addi.d $sp, $sp, -112 + /* Store regs */ + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + ST $f24, $sp, 40 + ST $f25, $sp, 48 + ST $f26, $sp, 56 + ST $f27, $sp, 64 + ST $f28, $sp, 72 + ST $f29, $sp, 80 + ST $f30, $sp, 88 + ST $f31, $sp, 96 + ST ALPHA, $sp, 104 + +#if defined (TRMMKERNEL) && !defined(LEFT) + sub.d OFF, ZERO, OFFSET +#else + xor OFF, OFF, OFF +#endif + + /* if (!(N >> 2)) goto L_N3 */ + srai.d J, N, 2 /* J = bn >> 2 */ + andi N, N, 0x03 + vldrepl.d VALPHA, $sp, 104 /* When N < 4, VALPHA will not changed */ + beq ZERO, J, .L_N3 + +.L_J1: /* J-- && This loop include Condition 1 */ + +/************************* Condition 1 if((N >> 2) && (M >> 3)) START !!! ************************* +* dgemm_core_16x4 */ + move C0, C + move A0, A + slli.d T0, LDC, 3 + add.d C1, C0, T0 + addi.d J, J, -1 /* J-- */ + add.d C2, C1, T0 + add.d C3, C2, T0 + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 3)) goto L_M8 */ + srai.d I, M, 3 /* I = bm >> 3 */ + beq ZERO, I, .L_M8 + +.L_I1: /* I-- */ +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + /* Calculate the first set of D0~D15, + * avoidig set 0 operation + * Load 8 * 64 from A0 + * U0 = {a1, a0} + * U1 = {a3, a2} + * U2 = {a5, a4} + * U3 = {a7, a6} + */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vldrepl.d U4, B0, 0x00 + preld 0, C0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + vfmul.d D1, U1, U4 + preld 0, C0, 0x20 + vfmul.d D2, U2, U4 + vfmul.d D3, U3, U4 + + vldrepl.d U5, B0, 0x08 + preld 0, C1, 0x00 + /* line 2 */ + vfmul.d D4, U0, U5 + vfmul.d D5, U1, U5 + preld 0, C1, 0x20 + vfmul.d D6, U2, U5 + vfmul.d D7, U3, U5 + + vldrepl.d U6, B0, 0x10 + preld 0, C2, 0x00 + /* line 3 */ + vfmul.d D8, U0, U6 + vfmul.d D9, U1, U6 + preld 0, C2, 0x20 + vfmul.d D10, U2, U6 + vfmul.d D11, U3, U6 + + vldrepl.d U7, B0, 0x18 + preld 0, C3, 0x00 + /* line 4 */ + vfmul.d D12, U0, U7 + vfmul.d D13, U1, U7 + preld 0, C3, 0x20 + vfmul.d D14, U2, U7 + vfmul.d D15, U3, U7 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_L7 */ + beq ZERO,TL, .L_L7 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + vld U10, A0, 0x20 + vld U11, A0, 0x30 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + vldrepl.d U13, B0, 0x08 + vldrepl.d U14, B0, 0x10 + vldrepl.d U15, B0, 0x18 + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + beq ZERO, TL, .L_TL1_END +.L_TL1: /* TL-- */ + 
KERNEL8x8x4 + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_TL1 + +.L_TL1_END: + KERNEL8x8x4_END + + /* Maybe we need calculate the last + * 7 sets of D0~D15? + */ +.L_L7: + /* if (!(L & 7)) goto L_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_L0 + +.L_L71: + /* Load 16 * 64 from A0 */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + /* Cumulative D0~D15 */ + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + vldrepl.d U5, B0, 0x08 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + vfmadd.d D6, U2, U5, D6 + vfmadd.d D7, U3, U5, D7 + + vldrepl.d U6, B0, 0x10 + vfmadd.d D8, U0, U6, D8 + vfmadd.d D9, U1, U6, D9 + vfmadd.d D10, U2, U6, D10 + vfmadd.d D11, U3, U6, D11 + + vldrepl.d U7, B0, 0x18 + vfmadd.d D12, U0, U7, D12 + vfmadd.d D13, U1, U7, D13 + vfmadd.d D14, U2, U7, D14 + vfmadd.d D15, U3, U7, D15 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_L71 + +.L_L0: + vldrepl.d VALPHA, $sp, 104 +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + vfmul.d D2, D2, VALPHA + vfmul.d D3, D3, VALPHA + vfmul.d D4, D4, VALPHA + vfmul.d D5, D5, VALPHA + vfmul.d D6, D6, VALPHA + vfmul.d D7, D7, VALPHA + vfmul.d D8, D8, VALPHA + vfmul.d D9, D9, VALPHA + vfmul.d D10, D10, VALPHA + vfmul.d D11, D11, VALPHA + vfmul.d D12, D12, VALPHA + vfmul.d D13, D13, VALPHA + vfmul.d D14, D14, VALPHA + vfmul.d D15, D15, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C0, 0x10 + vld U2, C0, 0x20 + vld U3, C0, 0x30 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + vfmadd.d D1, D1, VALPHA, U1 + vfmadd.d D2, D2, VALPHA, U2 + vfmadd.d D3, D3, VALPHA, U3 + + /* Load C1 */ + vld U4, C1, 0x00 + vld U5, C1, 0x10 + vld U6, C1, 0x20 + vld U7, C1, 0x30 + vfmadd.d D4, D4, VALPHA, U4 + vfmadd.d D5, D5, VALPHA, U5 + vfmadd.d D6, D6, VALPHA, U6 + vfmadd.d D7, D7, VALPHA, U7 + + /* Load C2 */ + vld U8, C2, 0x00 + vld U9, C2, 0x10 + vld U10, C2, 0x20 + vld U11, C2, 0x30 + vfmadd.d D8, D8, VALPHA, U8 + vfmadd.d D9, D9, VALPHA, U9 + vfmadd.d D10, D10, VALPHA, U10 + vfmadd.d D11, D11, VALPHA, U11 + + /* Load C3 */ + vld U0, C3, 0x00 + vld U1, C3, 0x10 + vld U2, C3, 0x20 + vld U3, C3, 0x30 + vfmadd.d D12, D12, VALPHA, U0 + vfmadd.d D13, D13, VALPHA, U1 + vfmadd.d D14, D14, VALPHA, U2 + vfmadd.d D15, D15, VALPHA, U3 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + vst D0, C0, 0x00 + vst D1, C0, 0x10 + vst D2, C0, 0x20 + vst D3, C0, 0x30 + /* Store C1 */ + vst D4, C1, 0x00 + vst D5, C1, 0x10 + vst D6, C1, 0x20 + vst D7, C1, 0x30 + /* Store C2 */ + vst D8, C2, 0x00 + vst D9, C2, 0x10 + vst D10, C2, 0x20 + vst D11, C2, 0x30 + /* Store C3 */ + vst D12, C3, 0x00 + vst D13, C3, 0x10 + vst D14, C3, 0x20 + vst D15, C3, 0x30 + + /* Add stride for C */ + addi.d C0, C0, 0x40 + addi.d C1, C1, 0x40 + addi.d C2, C2, 0x40 + addi.d C3, C3, 0x40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -8 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x06 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) + + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_I1 + +.L_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 7 + beq ZERO,I, .L_M0 + + andi I, M, 4 + beq ZERO,I, .L_M2 
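+
+    /* M & 4 tail of the N >> 2 loop: same software-pipelined K loop as above,
+     * but the C tile is 4x4, kept in D0/D1, D4/D5, D8/D9 and D12/D13 and
+     * driven by KERNEL8x4x4. */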
+ +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d A0, A0, T0 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 4 * 64 from A0 */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + vfmul.d D1, U1, U4 + + vldrepl.d U5, B0, 0x08 + /* line 2 */ + vfmul.d D4, U0, U5 + vfmul.d D5, U1, U5 + + vldrepl.d U6, B0, 0x10 + /* line 3 */ + vfmul.d D8, U0, U6 + vfmul.d D9, U1, U6 + + vldrepl.d U7, B0, 0x18 + /* line 4 */ + vfmul.d D12, U0, U7 + vfmul.d D13, U1, U7 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M4_L7 */ + beq ZERO,TL, .L_M4_L7 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + vldrepl.d U13, B0, 0x08 + vldrepl.d U14, B0, 0x10 + vldrepl.d U15, B0, 0x18 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + beq ZERO, TL, .L_M4_TL1_END + +.L_M4_TL1: /* TL-- */ + KERNEL8x4x4 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M4_TL1 + +.L_M4_TL1_END: + KERNEL8x4x4_END + +.L_M4_L7: + /* if (!(L & 7)) goto L_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M4_L0 + +.L_M4_L71: + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vldrepl.d U5, B0, 0x08 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + vldrepl.d U6, B0, 0x10 + vfmadd.d D8, U0, U6, D8 + vfmadd.d D9, U1, U6, D9 + + vldrepl.d U7, B0, 0x18 + vfmadd.d D12, U0, U7, D12 + vfmadd.d D13, U1, U7, D13 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M4_L71 + +.L_M4_L0: + vldrepl.d VALPHA, $sp, 104 +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + vfmul.d D4, D4, VALPHA + vfmul.d D5, D5, VALPHA + vfmul.d D8, D8, VALPHA + vfmul.d D9, D9, VALPHA + vfmul.d D12, D12, VALPHA + vfmul.d D13, D13, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C0, 0x10 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + vfmadd.d D1, D1, VALPHA, U1 + + /* Load C1 */ + vld U2, C1, 0x00 + vld U3, C1, 0x10 + vfmadd.d D4, D4, VALPHA, U2 + vfmadd.d D5, D5, VALPHA, U3 + + /* Load C2 */ + vld U4, C2, 0x00 + vld U5, C2, 0x10 + vfmadd.d D8, D8, VALPHA, U4 + vfmadd.d D9, D9, VALPHA, U5 + + /* Load C3 */ + vld U6, C3, 0x00 + vld U7, C3, 0x10 + vfmadd.d D12, D12, VALPHA, U6 + vfmadd.d D13, D13, VALPHA, U7 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + vst D0, C0, 0x00 + vst D1, C0, 0x10 + /* Store C1 */ + vst D4, C1, 0x00 + vst D5, C1, 0x10 + /* Store C2 */ + vst D8, C2, 0x00 + vst D9, C2, 0x10 + /* Store C3 */ + vst D12, C3, 0x00 + vst D13, C3, 0x10 + + /* Add stride for C */ + addi.d C0, C0, 0x20 + addi.d C1, C1, 0x20 + addi.d C2, C2, 0x20 + addi.d C3, C3, 0x20 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -4 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif + +#ifdef LEFT 
+ /* number of values in A */ + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N >> 2 ) && (M & 4) ) End************/ + +.L_M2: + andi I, M, 2 + beq ZERO,I, .L_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + vldrepl.d U0, A0, 0x00 + vldrepl.d U1, A0, 0x08 + + vld U4, B0, 0x00 + vld U5, B0, 0x10 + + vfmul.d D0, U0, U4 + vfmul.d D1, U0, U5 + vfmul.d D2, U1, U4 + vfmul.d D3, U1, U5 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M2_L7 */ + beq ZERO,TL, .L_M2_L7 + + vldrepl.d U8, A0, 0x00 + vldrepl.d U9, A0, 0x08 + + addi.d TL, TL, -1 + + vld U12, B0, 0x00 + vld U13, B0, 0x10 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + beq ZERO, TL, .L_M2_TL1_END +.L_M2_TL1: /* TL-- */ + KERNEL8x2x4 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M2_TL1 +.L_M2_TL1_END: + KERNEL8x2x4_END + +.L_M2_L7: + /* if (!(L & 7)) goto L_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M2_L0 + +.L_M2_L71: + vldrepl.d U0, A0, 0x00 + vldrepl.d U1, A0, 0x08 + + vld U4, B0, 0x00 + vld U5, B0, 0x10 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U0, U5, D1 + vfmadd.d D2, U1, U4, D2 + vfmadd.d D3, U1, U5, D3 + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M2_L71 + +.L_M2_L0: + vldrepl.d VALPHA, $sp, 104 +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + vfmul.d D2, D2, VALPHA + vfmul.d D3, D3, VALPHA + + vstelm.d D0, C0, 0x00, 0x00 + vstelm.d D0, C1, 0x00, 0x01 + vstelm.d D1, C2, 0x00, 0x00 + vstelm.d D1, C3, 0x00, 0x01 + vstelm.d D2, C0, 0x08, 0x00 + vstelm.d D2, C1, 0x08, 0x01 + vstelm.d D3, C2, 0x08, 0x00 + vstelm.d D3, C3, 0x08, 0x01 +#else + /* Load C0 */ + vld U0, C0, 0x00 + /* Load C1 */ + vld U1, C1, 0x00 + /* Load C2 */ + vld U2, C2, 0x00 + /* Load C3 */ + vld U3, C3, 0x00 + + vilvl.d D4, D2, D0 //C0 + vilvh.d D5, D2, D0 //C1 + vilvl.d D6, D3, D1 //C2 + vilvh.d D7, D3, D1 //C3 + + vfmadd.d D0, D4, VALPHA, U0 + vfmadd.d D2, D5, VALPHA, U1 + vfmadd.d D1, D6, VALPHA, U2 + vfmadd.d D3, D7, VALPHA, U3 + + vst D0, C0, 0x00 + vst D2, C1, 0x00 + vst D1, C2, 0x00 + vst D3, C3, 0x00 +#endif // #if defined(TRMMKERNEL) + + /* Add stride for C */ + addi.d C0, C0, 0x10 + addi.d C1, C1, 0x10 + addi.d C2, C2, 0x10 + addi.d C3, C3, 0x10 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -2 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N >> 2 ) && (M & 2) ) End************/ + +.L_M1: + andi I, M, 1 + beq ZERO,I, .L_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && 
!defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + vldrepl.d U0, A0, 0x00 + vld U4, B0, 0x00 + vld U5, B0, 0x10 + vfmul.d D0, U0, U4 + vfmul.d D1, U0, U5 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M1_L7 */ + beq ZERO,TL, .L_M1_L7 + + vldrepl.d U8, A0, 0x00 + + addi.d TL, TL, -1 + vld U12, B0, 0x00 + vld U13, B0, 0x10 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + beq ZERO, TL, .L_M1_TL1_END + +.L_M1_TL1: /* TL-- */ + KERNEL8x1x4 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M1_TL1 +.L_M1_TL1_END: + KERNEL8x1x4_END + +.L_M1_L7: + /* if (!(L & 7)) goto L_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M1_L0 + +.L_M1_L71: + vldrepl.d U0, A0, 0x00 + vld U4, B0, 0x00 + vld U5, B0, 0x10 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U0, U5, D1 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M1_L71 + +.L_M1_L0: + vldrepl.d VALPHA, $sp, 104 +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + + vstelm.d D0, C0, 0x00, 0x00 + vstelm.d D0, C1, 0x00, 0x01 + vstelm.d D1, C2, 0x00, 0x00 + vstelm.d D1, C3, 0x00, 0x01 +#else + /* Load C0 */ + vldrepl.d U0, C0, 0x00 + vldrepl.d U1, C1, 0x00 + vilvl.d D4, U1, U0 + vfmadd.d D6, D0, VALPHA, D4 + + vldrepl.d U2, C2, 0x00 + vldrepl.d U3, C3, 0x00 + vilvl.d D5, U3, U2 + vfmadd.d D7, D1, VALPHA, D5 + + vstelm.d D6, C0, 0x00, 0x00 + vstelm.d D6, C1, 0x00, 0x01 + vstelm.d D7, C2, 0x00, 0x00 + vstelm.d D7, C3, 0x00, 0x01 +#endif // #if defined(TRMMKERNEL) + + /* Add stride for C */ + addi.d C0, C0, 0x08 + addi.d C1, C1, 0x08 + addi.d C2, C2, 0x08 + addi.d C3, C3, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -1 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x03 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N >> 2 ) && (M & 1) ) End************/ + +.L_M0: + /* Add stride for B and C + * B += (K * 32) + * C += (LDC * 32) + */ + /* since the array type is double, + * so we must mul 32 + */ + slli.d T0, K, 5 + slli.d T1, LDC, 5 + add.d B, B, T0 + add.d C, C, T1 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d OFF, OFF, 0x04 +#endif + + blt ZERO, J, .L_J1 + +//////////////// go back to L_J1 ///////////////// +///////////////////////////////////////////////// +/************************ Condition 1 if((N >> 2) && (M >> 3)) END !!! ************************/ + + vldrepl.d VALPHA, $sp, 104 + +.L_N3: + andi J, N, 2 + beq ZERO, J, .L_N1 + +/************************* Condition 2 if((N & 2) && (M >> 3)) START !!! 
************************* +* dgemm_core_16x2 */ + + move C0, C + move A0, A + slli.d T0, LDC, 3 + add.d C1, C0, T0 + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 3)) goto L_N3_M8 */ + srai.d I, M, 3 /* I = bm >> 3 */ + beq ZERO, I, .L_N3_M8 + +.L_N3_I1: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 8 * 64 from A0 + * U0 = {a1, a0} + * U1 = {a3, a2} + * U2 = {a5, a4} + * U3 = {a7, a6} + */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + vfmul.d D1, U1, U4 + vfmul.d D2, U2, U4 + vfmul.d D3, U3, U4 + + vldrepl.d U5, B0, 0x08 + /* line 2 */ + vfmul.d D4, U0, U5 + vfmul.d D5, U1, U5 + vfmul.d D6, U2, U5 + vfmul.d D7, U3, U5 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_L7 */ + beq ZERO,TL, .L_N3_L7 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + vld U10, A0, 0x20 + vld U11, A0, 0x30 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + vldrepl.d U13, B0, 0x08 + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + beq ZERO, TL, .L_N3_TL1_END + +.L_N3_TL1: /* TL-- */ + KERNEL8x8x2 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_TL1 +.L_N3_TL1_END: + KERNEL8x8x2_END + +.L_N3_L7: + /* if (!(L & 7)) goto L_N3_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_L0 + +.L_N3_L71: + /* Load 16 * 64 from A0 */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + vldrepl.d U5, B0, 0x08 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + vfmadd.d D6, U2, U5, D6 + vfmadd.d D7, U3, U5, D7 + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_L71 + +.L_N3_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + vfmul.d D2, D2, VALPHA + vfmul.d D3, D3, VALPHA + vfmul.d D4, D4, VALPHA + vfmul.d D5, D5, VALPHA + vfmul.d D6, D6, VALPHA + vfmul.d D7, D7, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C0, 0x10 + vld U2, C0, 0x20 + vld U3, C0, 0x30 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + vfmadd.d D1, D1, VALPHA, U1 + vfmadd.d D2, D2, VALPHA, U2 + vfmadd.d D3, D3, VALPHA, U3 + + /* Load C1 */ + vld U4, C1, 0x00 + vld U5, C1, 0x10 + vld U6, C1, 0x20 + vld U7, C1, 0x30 + vfmadd.d D4, D4, VALPHA, U4 + vfmadd.d D5, D5, VALPHA, U5 + vfmadd.d D6, D6, VALPHA, U6 + vfmadd.d D7, D7, VALPHA, U7 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + vst D0, C0, 0x00 + vst D1, C0, 0x10 + vst D2, C0, 0x20 + vst D3, C0, 0x30 + /* Store C1 */ + vst D4, C1, 0x00 + vst D5, C1, 0x10 + vst D6, C1, 0x20 + vst D7, C1, 0x30 + + /* Add stride for C */ + addi.d C0, C0, 0x40 + addi.d C1, C1, 0x40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef 
LEFT + addi.d L, L, -8 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x06 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x8 +#endif +#endif // #if defined(TRMMKERNEL) + + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_N3_I1 + +.L_N3_M8: + /* We have done M & 8, considering M=4/2/1 */ + andi I, M, 7 + beq ZERO,I, .L_N3_M0 + + andi I, M, 4 + beq ZERO,I, .L_N3_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 4 * 64 from A0 */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + vfmul.d D1, U1, U4 + + vldrepl.d U5, B0, 0x08 + /* line 2 */ + vfmul.d D4, U0, U5 + vfmul.d D5, U1, U5 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M4_L7 */ + beq ZERO,TL, .L_N3_M4_L7 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + vldrepl.d U13, B0, 0x08 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + beq ZERO, TL, .L_N3_M4_TL1_END + +.L_N3_M4_TL1: /* TL-- */ + KERNEL8x4x2 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M4_TL1 +.L_N3_M4_TL1_END: + KERNEL8x4x2_END + +.L_N3_M4_L7: + /* if (!(L & 7)) goto L_N3_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M4_L0 + +.L_N3_M4_L71: + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vldrepl.d U5, B0, 0x08 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M4_L71 + +.L_N3_M4_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + vfmul.d D4, D4, VALPHA + vfmul.d D5, D5, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C0, 0x10 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + vfmadd.d D1, D1, VALPHA, U1 + + /* Load C1 */ + vld U2, C1, 0x00 + vld U3, C1, 0x10 + vfmadd.d D4, D4, VALPHA, U2 + vfmadd.d D5, D5, VALPHA, U3 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + vst D0, C0, 0x00 + vst D1, C0, 0x10 + /* Store C1 */ + vst D4, C1, 0x00 + vst D5, C1, 0x10 + + /* Add stride for C */ + addi.d C0, C0, 0x20 + addi.d C1, C1, 0x20 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -4 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2 ) && (M & 4) ) End************/ + +.L_N3_M2: + andi I, M, 2 + beq ZERO,I, .L_N3_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d 
L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + vld U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + + vldrepl.d U4, B0, 0x08 + /* line 2 */ + vfmul.d D4, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M2_L7 */ + beq ZERO,TL, .L_N3_M2_L7 + + vld U8, A0, 0x00 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + vldrepl.d U13, B0, 0x08 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + beq ZERO, TL, .L_N3_M2_TL1_END + +.L_N3_M2_TL1: /* TL-- */ + KERNEL8x2x2 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M2_TL1 +.L_N3_M2_TL1_END: + KERNEL8x2x2_END + +.L_N3_M2_L7: + /* if (!(L & 7)) goto L_N3_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M2_L0 + +.L_N3_M2_L71: + vld U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + vldrepl.d U5, B0, 0x08 + vfmadd.d D0, U0, U4, D0 + + vfmadd.d D4, U0, U5, D4 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M2_L71 + +.L_N3_M2_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D4, D4, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + vld U1, C1, 0x00 + vfmadd.d D4, D4, VALPHA, U1 +#endif // #if defined(TRMMKERNEL) + + vst D0, C0, 0x00 + vst D4, C1, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x10 + addi.d C1, C1, 0x10 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -2 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2 ) && (M & 2) ) End************/ + +.L_N3_M1: + andi I, M, 1 + beq ZERO,I, .L_N3_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 1 * 64 from A0 */ + vldrepl.d U0, A0, 0x00 + + vld U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M1_L7 */ + beq ZERO,TL, .L_N3_M1_L7 + + vldrepl.d U8, A0, 0x00 + + addi.d TL, TL, -1 + + vld U12, B0, 0x00 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + beq ZERO, TL, .L_N3_M1_TL1_END + +.L_N3_M1_TL1: /* TL-- */ + KERNEL8x1x2 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M1_TL1 +.L_N3_M1_TL1_END: + KERNEL8x1x2_END + +.L_N3_M1_L7: + /* if (!(L & 7)) goto L_N3_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M1_L0 + +.L_N3_M1_L71: + vldrepl.d U0, A0, 0x00 + + vld U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d 
B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M1_L71 + +.L_N3_M1_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C1, 0x00 + vilvl.d U2, U1, U0 + vfmadd.d D0, D0, VALPHA, U2 +#endif // #if defined(TRMMKERNEL) + + vstelm.d D0, C0, 0x00, 0x00 + vstelm.d D0, C1, 0x00, 0x01 + + /* Add stride for C */ + addi.d C0, C0, 0x08 + addi.d C1, C1, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -1 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x03 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2 ) && (M & 1) ) End************/ + +.L_N3_M0: + /* Add stride for B and C + * B += (K * 16) + * C += (LDC * 16) + */ + /* since the array type is double, + * so we must mul 16 + */ + slli.d T0, K, 4 + slli.d T1, LDC, 4 + add.d B, B, T0 + add.d C, C, T1 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d OFF, OFF, 0x02 +#endif + + /* We must reinit I */ + srai.d I, M, 4 /* I = bm >> 4 */ + +/************************* Condition 2 if((N & 2) && (M >> 3)) End !!! ************************* +* dgemm_core_16x2 */ + +.L_N1: + andi J, N, 1 + beq ZERO, J, .L_N0 + +/************************* Condition 3 if((N & 1) && (M >> 3)) START !!! ************************* +* dgemm_core_16x1 */ + + move C0, C + move A0, A + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 3)) goto L_N1_M8 */ + srai.d I, M, 3 /* I = bm >> 3 */ + beq ZERO, I, .L_N1_M8 + +.L_N1_I1: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 8 * 64 from A0 + * U0 = {a3, a2} + * U1 = {a1, a0} + * U2 = {a5, a4} + * U3 = {a7, a6} + */ + + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + vfmul.d D1, U1, U4 + vfmul.d D2, U2, U4 + vfmul.d D3, U3, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_L7 */ + beq ZERO,TL, .L_N1_L7 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + vld U10, A0, 0x20 + vld U11, A0, 0x30 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + beq ZERO, TL, .L_N1_TL1_END +.L_N1_TL1: /* TL-- */ + KERNEL8x8x1 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_TL1 +.L_N1_TL1_END: + KERNEL8x8x1_END + +.L_N1_L7: + /* if (!(L & 7)) goto L_N1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_L0 + +.L_N1_L71: + /* Load 16 * 64 from A0 */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_L71 + +.L_N1_L0: +#if 
defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + vfmul.d D2, D2, VALPHA + vfmul.d D3, D3, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C0, 0x10 + vld U2, C0, 0x20 + vld U3, C0, 0x30 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + vfmadd.d D1, D1, VALPHA, U1 + vfmadd.d D2, D2, VALPHA, U2 + vfmadd.d D3, D3, VALPHA, U3 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + vst D0, C0, 0x00 + vst D1, C0, 0x10 + vst D2, C0, 0x20 + vst D3, C0, 0x30 + + /* Add stride for C */ + addi.d C0, C0, 0x40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -8 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x06 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x8 +#endif +#endif // #if defined(TRMMKERNEL) + + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_N1_I1 + +.L_N1_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 7 + beq ZERO,I, .L_N1_M0 + + andi I, M, 4 + beq ZERO,I, .L_N1_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 4 * 64 from A0 */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + vfmul.d D1, U1, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M4_L7 */ + beq ZERO,TL, .L_N1_M4_L7 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + beq ZERO, TL, .L_N1_M4_TL1_END + +.L_N1_M4_TL1: /* TL-- */ + KERNEL8x4x1 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M4_TL1 +.L_N1_M4_TL1_END: + KERNEL8x4x1_END + +.L_N1_M4_L7: + /* if (!(L & 7)) goto L_N1_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M4_L0 + +.L_N1_M4_L71: + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M4_L71 + +.L_N1_M4_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C0, 0x10 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + vfmadd.d D1, D1, VALPHA, U1 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + vst D0, C0, 0x00 + vst D1, C0, 0x10 + + /* Add stride for C */ + addi.d C0, C0, 0x20 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -4 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 1) && (M & 4) ) End************/ + +.L_N1_M2: + andi I, M, 2 + beq ZERO,I, .L_N1_M1 + +#if defined(TRMMKERNEL) +#if 
(defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + vld U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M2_L7 */ + beq ZERO,TL, .L_N1_M2_L7 + + vld U8, A0, 0x00 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + beq ZERO, TL, .L_N1_M2_TL1_END + +.L_N1_M2_TL1: /* TL-- */ + KERNEL8x2x1 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M2_TL1 +.L_N1_M2_TL1_END: + KERNEL8x2x1_END + +.L_N1_M2_L7: + /* if (!(L & 7)) goto L_N1_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M2_L0 + +.L_N1_M2_L71: + vld U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M2_L71 + +.L_N1_M2_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ +#endif // #if defined(TRMMKERNEL) + + vstelm.d D0, C0, 0x00, 0x00 + vstelm.d D0, C0, 0x08, 0x01 + + /* Add stride for C */ + addi.d C0, C0, 0x10 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -2 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 1 ) && (M & 2) ) End************/ + +.L_N1_M1: + andi I, M, 1 + beq ZERO,I, .L_N1_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 1 * 64 from A0 */ + vldrepl.d U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M1_L7 */ + beq ZERO,TL, .L_N1_M1_L7 + + vldrepl.d U8, A0, 0x00 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + beq ZERO, TL, .L_N1_M1_TL1_END + +.L_N1_M1_TL1: /* TL-- */ + KERNEL8x1x1 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M1_TL1 +.L_N1_M1_TL1_END: + KERNEL8x1x1_END + +.L_N1_M1_L7: + /* if (!(L & 7)) goto L_N1_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M1_L0 + +.L_N1_M1_L71: + vldrepl.d U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + + /* 
Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M1_L71 + +.L_N1_M1_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA +#else + /* Load C0 */ + vldrepl.d U0, C0, 0x00 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ +#endif // #if defined(TRMMKERNEL) + + vstelm.d D0, C0, 0x00, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -1 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x03 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 1 ) && (M & 1) ) End************/ + +.L_N1_M0: + +/************************* Condition 3 if((N & 1) && (M >> 3)) End !!! ************************* +* dgemm_core_16x1 */ + +.L_N0: + /* Restore regs */ + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LD $f24, $sp, 40 + LD $f25, $sp, 48 + LD $f26, $sp, 56 + LD $f27, $sp, 64 + LD $f28, $sp, 72 + LD $f29, $sp, 80 + LD $f30, $sp, 88 + LD $f31, $sp, 96 + addi.d $sp, $sp, 112 + + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/param.h b/param.h index ee4640f57..e4e242d5d 100644 --- a/param.h +++ b/param.h @@ -2888,8 +2888,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 8 -#define DGEMM_DEFAULT_UNROLL_M 2 -#define DGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 #define CGEMM_DEFAULT_UNROLL_M 1 #define CGEMM_DEFAULT_UNROLL_N 4 From e771be185ee3ae604ce0b6ffb0dc38258b04f866 Mon Sep 17 00:00:00 2001 From: guxiwei Date: Thu, 21 Dec 2023 14:28:06 +0800 Subject: [PATCH 09/21] Optimize copy functions with lsx. 
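
The LSX copy routines vectorize the packing that the generic C copy kernels
perform: gathering fixed-width groups of columns (ncopy) or rows (tcopy) of
the lda-strided operand into the contiguous panel layout consumed by the
8x4 kernel, so the kernel only issues unit-stride vector loads. As a rough
model only (function and variable names are illustrative, and the remainder
handling is simplified), an ncopy-by-4 pack does the following:

/* Interleave groups of 4 source columns row by row into a contiguous buffer.
 * a is column-major with leading dimension lda. Simplified remainder: the
 * real copy kernels also pairwise-interleave a 2-column tail. */
static void pack_ncopy4(long m, long n, const double *a, long lda, double *b)
{
    long j = 0;
    for (; j + 4 <= n; j += 4) {
        const double *c0 = a + (j + 0) * lda, *c1 = a + (j + 1) * lda;
        const double *c2 = a + (j + 2) * lda, *c3 = a + (j + 3) * lda;
        for (long i = 0; i < m; i++) {
            *b++ = c0[i]; *b++ = c1[i]; *b++ = c2[i]; *b++ = c3[i];
        }
    }
    for (; j < n; j++) {
        const double *c = a + j * lda;
        for (long i = 0; i < m; i++)
            *b++ = c[i];
    }
}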
Signed-off-by: Hao Chen --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 8 +- kernel/loongarch64/dgemm_ncopy_4_lsx.S | 185 +++++++ kernel/loongarch64/dgemm_ncopy_8_lsx.S | 283 +++++++++++ kernel/loongarch64/dgemm_tcopy_4_lsx.S | 280 +++++++++++ kernel/loongarch64/dgemm_tcopy_8_lsx.S | 597 +++++++++++++++++++++++ 5 files changed, 1349 insertions(+), 4 deletions(-) create mode 100644 kernel/loongarch64/dgemm_ncopy_4_lsx.S create mode 100644 kernel/loongarch64/dgemm_ncopy_8_lsx.S create mode 100644 kernel/loongarch64/dgemm_tcopy_4_lsx.S create mode 100644 kernel/loongarch64/dgemm_tcopy_8_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index a78c0dbc5..00cb769eb 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -59,10 +59,10 @@ SNRM2KERNEL = snrm2_lsx.S DNRM2KERNEL = dnrm2_lsx.S DGEMMKERNEL = dgemm_kernel_8x4.S -DGEMMINCOPY = ../generic/gemm_ncopy_8.c -DGEMMITCOPY = ../generic/gemm_tcopy_8.c -DGEMMONCOPY = ../generic/gemm_ncopy_4.c -DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPY = dgemm_ncopy_8_lsx.S +DGEMMITCOPY = dgemm_tcopy_8_lsx.S +DGEMMONCOPY = dgemm_ncopy_4_lsx.S +DGEMMOTCOPY = dgemm_tcopy_4_lsx.S DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/loongarch64/dgemm_ncopy_4_lsx.S b/kernel/loongarch64/dgemm_ncopy_4_lsx.S new file mode 100644 index 000000000..048a49af6 --- /dev/null +++ b/kernel/loongarch64/dgemm_ncopy_4_lsx.S @@ -0,0 +1,185 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r21 +#define TL $r7 +#define T0 $r6 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define D0 $vr8 +#define D1 $vr9 +#define D2 $vr10 +#define D3 $vr11 +#define D4 $vr12 +#define D5 $vr13 +#define D6 $vr14 +#define D7 $vr15 + + PROLOGUE + + move TD, DST + move TS, SRC + slli.d TL, LDA, 0x03 + slli.d T0, TL, 0x01 + srai.d J, N, 0x02 + beq J, ZERO, .L_N2 +.L_J1: /* J-- */ + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x02 + add.d S3, S2, TL + add.d S4, S2, T0 + add.d TS, S3, T0 + addi.d J, J, -1 + beq I, ZERO, .L_I3 +.L_I1: /* I-- */ + GLD v, , U0, S1, 0x00, U1, S2, 0x00, U2, S3, 0x00, U3, S4, 0x00 + GINTERLACE v, d, D0, D2, U1, U0 + GINTERLACE v, d, D1, D3, U3, U2 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30 + addi.d TD, TD, 0x40 + + GLD v, , U0, S1, 0x10, U1, S2, 0x10, U2, S3, 0x10, U3, S4, 0x10 + GINTERLACE v, d, D0, D2, U1, U0 + GINTERLACE v, d, D1, D3, U3, U2 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 +.L_I3: + andi I, M, 0x03 + beq I, ZERO, .L_I0 +.L_II1: + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + fst.d F2, TD, 0x10 + addi.d S3, S3, 0x08 + fst.d F3, TD, 0x18 + addi.d S4, S4, 0x08 + + addi.d TD, TD, 0x20 + addi.d I, I, -1 + blt ZERO, I, .L_II1 +.L_I0: + blt ZERO, J, .L_J1 +.L_N2: + andi J, N, 0x02 + beq ZERO, J, .L_N1 + + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x01 + add.d TS, S2, TL + beq I, ZERO, .L_2I3 +.L_2I1: /* I-- */ + GLD v, , U0, S1, 0x00, U1, S2, 0x00 + GINTERLACE v, d, D0, D1, U1, U0 + GST v, , D0, TD, 0x00, D1, TD, 0x10 + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d TD, TD, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_2I1 +.L_2I3: + andi I, M, 0x01 + beq ZERO, I, .L_N1 +.L_2II1: /* I-- */ + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fst.d F0, TD, 0x00 + addi.d I, I, -1 + fst.d F1, TD, 0x08 + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d TD, TD, 0x10 + blt ZERO, I, .L_2II1 +.L_N1: + move S1, TS + beq ZERO, M, .L_N0 +.L_M1: + fld.d F0, S1, 0x00 + addi.d S1, S1, 0x08 + fst.d F0, TD, 0x00 + addi.d TD, TD, 0x08 + addi.d M, M, -1 + blt ZERO, M, .L_M1 +.L_N0: + jirl $r0, $r1, 0x00 + EPILOGUE diff --git a/kernel/loongarch64/dgemm_ncopy_8_lsx.S b/kernel/loongarch64/dgemm_ncopy_8_lsx.S new file mode 100644 index 000000000..30bebe8df --- /dev/null +++ b/kernel/loongarch64/dgemm_ncopy_8_lsx.S @@ -0,0 +1,283 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r21 +#define TL $r7 +#define T0 $r6 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define D0 $vr8 +#define D1 $vr9 +#define D2 $vr10 +#define D3 $vr11 +#define D4 $vr12 +#define D5 $vr13 +#define D6 $vr14 +#define D7 $vr15 + + PROLOGUE + push_if_used 26, 32 + move TD, DST + move TS, SRC + slli.d TL, LDA, 0x03 + slli.d T0, TL, 0x01 + srai.d J, N, 0x03 + beq J, ZERO, .L_N4 +.L_J1: + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x03 + add.d S3, S2, TL + addi.d J, J, -1 + add.d S4, S3, TL + add.d S5, S3, T0 + add.d S6, S4, T0 + add.d S7, S5, T0 + add.d S8, S6, T0 + add.d TS, S7, T0 + beq I, ZERO, .L_I7 +.L_I1: + GLD v, , U0, S1, 0x00, U1, S2, 0x00, U2, S3, 0x00, U3, S4, 0x00, \ + U4, S5, 0x00, U5, S6, 0x00, U6, S7, 0x00, U7, S8, 0x00 + GINTERLACE v, d, D0, D4, U1, U0 + GINTERLACE v, d, D1, D5, U3, U2 + GINTERLACE v, d, D2, D6, U5, U4 + GINTERLACE v, d, D3, D7, U7, U6 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \ + D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70 + addi.d TD, TD, 0x80 + GLD v, , U0, S1, 0x10, U1, S2, 0x10, U2, S3, 0x10, U3, S4, 0x10, \ + U4, S5, 0x10, U5, S6, 0x10, U6, S7, 0x10, U7, S8, 0x10 + GINTERLACE v, d, D0, D4, U1, U0 + GINTERLACE v, d, D1, D5, U3, U2 + GINTERLACE v, d, D2, D6, U5, U4 + GINTERLACE v, d, D3, D7, U7, U6 + GST v, , D0, TD, 0x00, 
D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \ + D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70 + addi.d TD, TD, 0x80 + GLD v, , U0, S1, 0x20, U1, S2, 0x20, U2, S3, 0x20, U3, S4, 0x20, \ + U4, S5, 0x20, U5, S6, 0x20, U6, S7, 0x20, U7, S8, 0x20 + GINTERLACE v, d, D0, D4, U1, U0 + GINTERLACE v, d, D1, D5, U3, U2 + GINTERLACE v, d, D2, D6, U5, U4 + GINTERLACE v, d, D3, D7, U7, U6 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \ + D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70 + addi.d TD, TD, 0x80 + GLD v, , U0, S1, 0x30, U1, S2, 0x30, U2, S3, 0x30, U3, S4, 0x30, \ + U4, S5, 0x30, U5, S6, 0x30, U6, S7, 0x30, U7, S8, 0x30 + GINTERLACE v, d, D0, D4, U1, U0 + GINTERLACE v, d, D1, D5, U3, U2 + GINTERLACE v, d, D2, D6, U5, U4 + GINTERLACE v, d, D3, D7, U7, U6 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \ + D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70 + addi.d TD, TD, 0x80 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d S5, S5, 0x40 + addi.d S6, S6, 0x40 + addi.d S7, S7, 0x40 + addi.d S8, S8, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 +.L_I7: + andi I, M, 0x07 + beq I, ZERO, .L_I0 +.L_II1: /* I-- */ + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + fld.d F4, S5, 0x00 + fld.d F5, S6, 0x00 + fld.d F6, S7, 0x00 + fld.d F7, S8, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + fst.d F2, TD, 0x10 + addi.d S3, S3, 0x08 + fst.d F3, TD, 0x18 + addi.d S4, S4, 0x08 + fst.d F4, TD, 0x20 + addi.d S5, S5, 0x08 + fst.d F5, TD, 0x28 + addi.d S6, S6, 0x08 + fst.d F6, TD, 0x30 + addi.d S7, S7, 0x08 + fst.d F7, TD, 0x38 + addi.d S8, S8, 0x08 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_II1 +.L_I0: + blt ZERO, J, .L_J1 +.L_N4: + andi J, N, 0x04 + beq ZERO, J, .L_N2 + + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x02 + add.d S3, S2, TL + add.d S4, S2, T0 + add.d TS, S3, T0 + beq I, ZERO, .L_I3 +.L_4I1: /* I-- */ + GLD v, , U0, S1, 0x00, U1, S2, 0x00, U2, S3, 0x00, U3, S4, 0x00 + GINTERLACE v, d, D0, D2, U1, U0 + GINTERLACE v, d, D1, D3, U3, U2 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30 + addi.d TD, TD, 0x40 + + GLD v, , U0, S1, 0x10, U1, S2, 0x10, U2, S3, 0x10, U3, S4, 0x10 + GINTERLACE v, d, D0, D2, U1, U0 + GINTERLACE v, d, D1, D3, U3, U2 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_4I1 +.L_I3: + andi I, M, 0x03 + beq I, ZERO, .L_N2 +.L_4II1: + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + fst.d F2, TD, 0x10 + addi.d S3, S3, 0x08 + fst.d F3, TD, 0x18 + addi.d S4, S4, 0x08 + + addi.d TD, TD, 0x20 + addi.d I, I, -1 + blt ZERO, I, .L_4II1 +.L_N2: + andi J, N, 0x02 + beq ZERO, J, .L_N1 + + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x01 + add.d TS, S2, TL + beq I, ZERO, .L_NI1 +.L_2I1: /* I-- */ + GLD v, , U0, S1, 0x00, U1, S2, 0x00 + GINTERLACE v, d, D0, D1, U1, U0 + GST v, , D0, TD, 0x00, D1, TD, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d TD, TD, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_2I1 +.L_NI1: + andi I, M, 0x01 + beq I, ZERO, .L_N1 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 
+ addi.d TD, TD, 0x10 +.L_N1: + move S1, TS + beq ZERO, M, .L_N0 +.L_M1: + fld.d F0, S1, 0x00 + addi.d S1, S1, 0x08 + fst.d F0, TD, 0x00 + addi.d TD, TD, 0x08 + addi.d M, M, -1 + blt ZERO, M, .L_M1 +.L_N0: + pop_if_used 26, 32 + jirl $r0, $r1, 0x00 + EPILOGUE diff --git a/kernel/loongarch64/dgemm_tcopy_4_lsx.S b/kernel/loongarch64/dgemm_tcopy_4_lsx.S new file mode 100644 index 000000000..134066471 --- /dev/null +++ b/kernel/loongarch64/dgemm_tcopy_4_lsx.S @@ -0,0 +1,280 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S0 $r11 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define P0 $r16 +#define P1 $r17 +#define P2 $r18 +#define P3 $r19 +#define T0 $r20 +#define T1 $r23 +#define TL $r7 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 + + PROLOGUE + push_if_used 18, 8 + + move S0, SRC + move P0, DST + + // Find P0, P2, P3 + srai.d T0, N, 0x02 + slli.d T0, T0, 0x02 + srai.d T1, N, 0x01 + slli.d T1, T1, 0x01 + mul.d T0, M, T0 + mul.d T1, M, T1 + slli.d T0, T0, 0x03 + slli.d T1, T1, 0x03 + add.d P2, DST, T0 + add.d P3, DST, T1 + + slli.d TL, LDA, 0x03 + srai.d J, M, 0x02 + slli.d T0, TL, 0x01 + slli.d T1, M, 0x05 + beq ZERO, J, .L_M3 +.L_J1: /* J-- */ + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T0 + add.d S4, S2, T0 + add.d S0, S3, T0 + + move P1, P0 + addi.d P0, P0, 0x80 + + srai.d I, N, 0x02 + addi.d J, J, -1 + beq ZERO, I, .L_N3 +.L_I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + vld U4, S3, 0x00 + vld U5, S3, 0x10 + vld U6, S4, 0x00 + vld U7, S4, 0x10 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + vst U2, P1, 0x20 + vst U3, P1, 0x30 + vst U4, P1, 0x40 + vst U5, P1, 0x50 + vst U6, P1, 0x60 + vst U7, P1, 0x70 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + add.d P1, P1, T1 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 +.L_N3: + andi I, N, 0x02 + beq ZERO, I, .L_N1 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vld U2, S3, 0x00 + vld U3, S4, 0x00 + + vst U0, P2, 0x00 + vst U1, P2, 0x10 + vst U2, P2, 0x20 + vst U3, P2, 0x30 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d P2, P2, 0x40 +.L_N1: + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, P3, 0x00 + fst.d F1, P3, 0x08 + fst.d F2, P3, 0x10 + fst.d F3, P3, 0x18 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d P3, P3, 0x20 + +.L_N0: + blt ZERO, J, .L_J1 + +.L_M3: + andi J, M, 0x02 + beq ZERO, J, .L_M1 + + move S1, S0 + add.d S2, S0, TL + add.d S0, S0, T0 + + move P1, P0 + addi.d P0, P0, 0x40 + + srai.d I, N, 0x02 + beq ZERO, I, .L_2N3 + +.L_2I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + vst U2, P1, 0x20 + vst U3, P1, 0x30 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d I, I, -1 + add.d P1, P1, T1 + + blt ZERO, I, .L_2I1 + +.L_2N3: + andi I, N, 0x02 + beq ZERO, I, .L_2N1 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, P2, 0x00 + vst U1, P2, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d P2, P2, 0x20 + +.L_2N1: + addi.d I, N, 0x01 + beq ZERO, I, .L_M1 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + + fst.d F0, P3, 0x00 + fst.d F1, P3, 0x08 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d P3, P3, 0x10 +.L_M1: + andi J, M, 0x01 + beq ZERO, J, .L_M0 + + move S1, S0 + move P1, P0 + + srai.d I, N, 0x02 + beq 
ZERO, I, .L_1N3 + +.L_1I1: + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + + addi.d S1, S1, 0x20 + addi.d I, I, -1 + add.d P1, P1, T1 + + blt ZERO, I, .L_1I1 + +.L_1N3: + andi I, N, 0x02 + beq I, ZERO, .L_1N1 + + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + + fst.d F0, P2, 0x00 + fst.d F1, P2, 0x08 + + addi.d S1, S1, 0x10 + addi.d P2, P2, 0x10 + +.L_1N1: + andi I, N, 0x01 + beq I, ZERO, .L_M0 + + fld.d F0, S1, 0x00 + + fst.d F0, P3, 0x00 + +.L_M0: + pop_if_used 18, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE diff --git a/kernel/loongarch64/dgemm_tcopy_8_lsx.S b/kernel/loongarch64/dgemm_tcopy_8_lsx.S new file mode 100644 index 000000000..a7e3ef69c --- /dev/null +++ b/kernel/loongarch64/dgemm_tcopy_8_lsx.S @@ -0,0 +1,597 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S0 $r11 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define P0 $r20 +#define P1 $r23 +#define P2 $r24 +#define P3 $r25 +#define P4 $r26 +#define P5 $r27 +#define T0 $r28 +#define T1 $r29 +#define TL $r7 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LASX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 + + PROLOGUE + push_if_used 24, 8 + + move S0, SRC + move P0, DST + + srai.d T0, N, 0x03 + srai.d T1, N, 0x02 + slli.d T0, T0, 0x03 + slli.d T1, T1, 0x02 + mul.d P2, M, T0 + mul.d P3, M, T1 + slli.d P2, P2, 0x03 + slli.d P3, P3, 0x03 + add.d P2, DST, P2 + add.d P3, DST, P3 + + srai.d T0, N, 0x01 + slli.d T0, T0, 0x01 + mul.d P4, M, T0 + slli.d P4, P4, 0x03 + add.d P4, DST, P4 + + slli.d TL, LDA, 0x03 + srai.d J, M, 0x03 + slli.d T0, TL, 0x01 + slli.d T1, M, 0x06 + beq ZERO, J, .L_M7 +.L_J1: /* J-- */ + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T0 + add.d S4, S2, T0 + add.d S5, S3, T0 + add.d S6, S4, T0 + add.d S7, S5, T0 + add.d S8, S6, T0 + add.d S0, S7, T0 + + move P1, P0 + addi.d P0, P0, 0x200 + + srai.d I, N, 0x03 + addi.d J, J, -1 + beq ZERO, I, .L_N7 + +.L_I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + vst U2, P1, 0x20 + vst U3, P1, 0x30 + vst U4, P1, 0x40 + vst U5, P1, 0x50 + vst U6, P1, 0x60 + vst U7, P1, 0x70 + + vld U0, S3, 0x00 + vld U1, S3, 0x10 + vld U2, S3, 0x20 + vld U3, S3, 0x30 + vld U4, S4, 0x00 + vld U5, S4, 0x10 + vld U6, S4, 0x20 + vld U7, S4, 0x30 + + vst U0, P1, 0x80 + vst U1, P1, 0x90 + vst U2, P1, 0xa0 + vst U3, P1, 0xb0 + vst U4, P1, 0xc0 + vst U5, P1, 0xd0 + vst U6, P1, 0xe0 + vst U7, P1, 0xf0 + + vld U0, S5, 0x00 + vld U1, S5, 0x10 + vld U2, S5, 0x20 + vld U3, S5, 0x30 + vld U4, S6, 0x00 + vld U5, S6, 0x10 + vld U6, S6, 0x20 + vld U7, S6, 0x30 + + vst U0, P1, 0x100 + vst U1, P1, 0x110 + vst U2, P1, 0x120 + vst U3, P1, 0x130 + vst U4, P1, 0x140 + vst U5, P1, 0x150 + vst U6, P1, 0x160 + vst U7, P1, 0x170 + + vld U0, S7, 0x00 + vld U1, S7, 0x10 + vld U2, S7, 0x20 + vld U3, S7, 0x30 + vld U4, S8, 0x00 + vld U5, S8, 0x10 + vld U6, S8, 0x20 + vld U7, S8, 0x30 + + vst U0, P1, 0x180 + vst U1, P1, 0x190 + vst U2, P1, 0x1a0 + vst U3, P1, 0x1b0 + vst U4, P1, 0x1c0 + vst U5, P1, 0x1d0 + vst U6, P1, 0x1e0 + vst U7, P1, 0x1f0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d S5, S5, 0x40 + addi.d S6, S6, 0x40 + addi.d S7, S7, 0x40 + addi.d S8, S8, 0x40 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_I1 +.L_N7: + andi I, N, 0x04 + beq ZERO, I, .L_N3 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + vld U4, S3, 0x00 + vld U5, S3, 0x10 + vld U6, S4, 0x00 + vld U7, S4, 0x10 + + vst U0, P2, 0x00 + vst U1, P2, 0x10 + vst U2, P2, 0x20 + vst U3, P2, 0x30 + vst U4, P2, 0x40 + 
vst U5, P2, 0x50 + vst U6, P2, 0x60 + vst U7, P2, 0x70 + + vld U0, S5, 0x00 + vld U1, S5, 0x10 + vld U2, S6, 0x00 + vld U3, S6, 0x10 + vld U4, S7, 0x00 + vld U5, S7, 0x10 + vld U6, S8, 0x00 + vld U7, S8, 0x10 + + vst U0, P2, 0x80 + vst U1, P2, 0x90 + vst U2, P2, 0xa0 + vst U3, P2, 0xb0 + vst U4, P2, 0xc0 + vst U5, P2, 0xd0 + vst U6, P2, 0xe0 + vst U7, P2, 0xf0 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d S5, S5, 0x20 + addi.d S6, S6, 0x20 + addi.d S7, S7, 0x20 + addi.d S8, S8, 0x20 + addi.d P2, P2, 0x100 + +.L_N3: + andi I, N, 0x02 + beq ZERO, I, .L_N1 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vld U2, S3, 0x00 + vld U3, S4, 0x00 + vld U4, S5, 0x00 + vld U5, S6, 0x00 + vld U6, S7, 0x00 + vld U7, S8, 0x00 + + vst U0, P3, 0x00 + vst U1, P3, 0x10 + vst U2, P3, 0x20 + vst U3, P3, 0x30 + vst U4, P3, 0x40 + vst U5, P3, 0x50 + vst U6, P3, 0x60 + vst U7, P3, 0x70 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d S5, S5, 0x10 + addi.d S6, S6, 0x10 + addi.d S7, S7, 0x10 + addi.d S8, S8, 0x10 + addi.d P3, P3, 0x80 + +.L_N1: + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + fld.d F4, S5, 0x00 + fld.d F5, S6, 0x00 + fld.d F6, S7, 0x00 + fld.d F7, S8, 0x00 + + fst.d F0, P4, 0x00 + fst.d F1, P4, 0x08 + fst.d F2, P4, 0x10 + fst.d F3, P4, 0x18 + fst.d F4, P4, 0x20 + fst.d F5, P4, 0x28 + + fst.d F6, P4, 0x30 + fst.d F7, P4, 0x38 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d S5, S5, 0x08 + addi.d S6, S6, 0x08 + addi.d S7, S7, 0x08 + addi.d S8, S8, 0x08 + addi.d P4, P4, 0x40 + +.L_N0: + blt ZERO, J, .L_J1 +.L_M7: + andi J, M, 0x04 + beq ZERO, J, .L_M3 + + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T0 + add.d S4, S2, T0 + add.d S0, S3, T0 + + move P1, P0 + addi.d P0, P0, 0x100 + + srai.d I, N, 0x03 + beq ZERO, I, .L_4N7 +.L_4I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + vst U2, P1, 0x20 + vst U3, P1, 0x30 + vst U4, P1, 0x40 + vst U5, P1, 0x50 + vst U6, P1, 0x60 + vst U7, P1, 0x70 + + vld U0, S3, 0x00 + vld U1, S3, 0x10 + vld U2, S3, 0x20 + vld U3, S3, 0x30 + vld U4, S4, 0x00 + vld U5, S4, 0x10 + vld U6, S4, 0x20 + vld U7, S4, 0x30 + + vst U0, P1, 0x80 + vst U1, P1, 0x90 + vst U2, P1, 0xa0 + vst U3, P1, 0xb0 + vst U4, P1, 0xc0 + vst U5, P1, 0xd0 + vst U6, P1, 0xe0 + vst U7, P1, 0xf0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_4I1 +.L_4N7: + andi I, N, 0x04 + beq ZERO, I, .L_4N3 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + vld U4, S3, 0x00 + vld U5, S3, 0x10 + vld U6, S4, 0x00 + vld U7, S4, 0x10 + + vst U0, P2, 0x00 + vst U1, P2, 0x10 + vst U2, P2, 0x20 + vst U3, P2, 0x30 + vst U4, P2, 0x40 + vst U5, P2, 0x50 + vst U6, P2, 0x60 + vst U7, P2, 0x70 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d P2, P2, 0x80 + +.L_4N3: + andi I, N, 0x02 + beq ZERO, I, .L_4N1 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vld U2, S3, 0x00 + vld U3, S4, 0x00 + + vst U0, P3, 0x00 + vst U1, P3, 0x10 + vst U2, P3, 0x20 + vst U3, P3, 0x30 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d P3, P3, 0x40 + +.L_4N1: + andi 
I, N, 0x01 + beq ZERO, I, .L_M3 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, P4, 0x00 + fst.d F1, P4, 0x08 + fst.d F2, P4, 0x10 + fst.d F3, P4, 0x18 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d P4, P4, 0x20 +.L_M3: + andi J, M, 0x02 + beq ZERO, J, .L_M1 + + move S1, S0 + add.d S2, S0, TL + add.d S0, S0, T0 + + move P1, P0 + addi.d P0, P0, 0x80 + + srai.d I, N, 0x03 + beq ZERO, I, .L_2N7 +.L_2I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + vst U2, P1, 0x20 + vst U3, P1, 0x30 + vst U4, P1, 0x40 + vst U5, P1, 0x50 + vst U6, P1, 0x60 + vst U7, P1, 0x70 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_2I1 +.L_2N7: + andi I, N, 0x04 + beq ZERO, I, .L_2N3 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, P2, 0x00 + vst U1, P2, 0x10 + vst U2, P2, 0x20 + vst U3, P2, 0x30 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d P2, P2, 0x40 + +.L_2N3: + andi I, N, 0x02 + beq ZERO, I, .L_2N1 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, P3, 0x00 + vst U1, P3, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d P3, P3, 0x20 + +.L_2N1: + andi I, N, 0x01 + beq ZERO, I, .L_M1 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + + fst.d F0, P4, 0x00 + fst.d F1, P4, 0x08 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d P4, P4, 0x10 +.L_M1: + andi J, M, 0x01 + beq ZERO, J, .L_M0 + + move S1, S0 + add.d S2, S0, TL + + move P1, P0 + addi.d P0, P0, 0x40 + + srai.d I, N, 0x03 + beq ZERO, I, .L_1N7 +.L_1I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + vst U2, P1, 0x20 + vst U3, P1, 0x30 + + addi.d S1, S1, 0x40 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_1I1 + +.L_1N7: + andi I, N, 0x04 + beq ZERO, I, .L_1N3 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, P2, 0x00 + vst U1, P2, 0x10 + + addi.d S1, S1, 0x20 + addi.d P2, P2, 0x20 + +.L_1N3: + andi I, N, 0x02 + beq ZERO, I, .L_1N1 + + vld U0, S1, 0x00 + vst U0, P3, 0x00 + + addi.d S1, S1, 0x10 + addi.d P3, P3, 0x10 + +.L_1N1: + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + fld.d F0, S1, 0x00 + + fst.d F0, P4, 0x00 + + addi.d S1, S1, 0x08 + addi.d P4, P4, 0x08 +.L_M0: + pop_if_used 24, 8 + jirl $r0, $r1, 0x00 + EPILOGUE From 06fd5b5995e66e7b54bb4b8496a3b946cd56212e Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Wed, 27 Dec 2023 10:44:02 +0800 Subject: [PATCH 10/21] loongarch64: Add and Refine asum optimization functions. 
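The scalar reference below is what the new asum/casum kernels compute: a plain
sum of absolute values for the real variants and |re| + |im| per element for
the complex ones, returning 0 when n <= 0 or incx <= 0 as the early exits in
the assembly do.  The function names are illustrative only; in the vector
paths |x| is formed by comparing against zero and selecting between x and -x
(fcmp.clt + bitsel) rather than calling fabs per element.

    #include <math.h>

    /* Real asum: sum of |x[i]| over n strided elements.          */
    static double asum_ref(long n, const double *x, long incx)
    {
        double s = 0.0;
        if (n <= 0 || incx <= 0) return s;
        for (long i = 0; i < n; i++)
            s += fabs(x[i * incx]);
        return s;
    }

    /* Complex asum: |re| + |im| of each strided complex element.  */
    static double zasum_ref(long n, const double *x, long incx)
    {
        double s = 0.0;
        if (n <= 0 || incx <= 0) return s;
        for (long i = 0; i < n; i++) {
            s += fabs(x[2 * i * incx]);       /* real part      */
            s += fabs(x[2 * i * incx + 1]);   /* imaginary part */
        }
        return s;
    }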
--- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 6 +- kernel/loongarch64/KERNEL.LOONGSON3R5 | 6 +- kernel/loongarch64/asum_lasx.S | 257 ++++++++++++++++ kernel/loongarch64/asum_lsx.S | 258 ++++++++++++++++ kernel/loongarch64/casum_lasx.S | 329 +++++++++++++++++++++ kernel/loongarch64/casum_lsx.S | 358 +++++++++++++++++++++++ kernel/loongarch64/dasum_lasx.S | 148 ---------- kernel/loongarch64/dasum_lsx.S | 158 ---------- kernel/loongarch64/sasum_lasx.S | 157 ---------- kernel/loongarch64/sasum_lsx.S | 148 ---------- 10 files changed, 1210 insertions(+), 615 deletions(-) create mode 100644 kernel/loongarch64/asum_lasx.S create mode 100644 kernel/loongarch64/asum_lsx.S create mode 100644 kernel/loongarch64/casum_lasx.S create mode 100644 kernel/loongarch64/casum_lsx.S delete mode 100644 kernel/loongarch64/dasum_lasx.S delete mode 100644 kernel/loongarch64/dasum_lsx.S delete mode 100644 kernel/loongarch64/sasum_lasx.S delete mode 100644 kernel/loongarch64/sasum_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 00cb769eb..201427dcd 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -49,8 +49,10 @@ DAXPBYKERNEL = daxpby_lsx.S SSUMKERNEL = sum_lsx.S DSUMKERNEL = sum_lsx.S -SASUMKERNEL = sasum_lsx.S -DASUMKERNEL = dasum_lsx.S +SASUMKERNEL = asum_lsx.S +DASUMKERNEL = asum_lsx.S +CASUMKERNEL = casum_lsx.S +ZASUMKERNEL = casum_lsx.S SROTKERNEL = rot_lsx.S DROTKERNEL = rot_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index e4c45e1fa..e822cb630 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -49,8 +49,10 @@ DAXPBYKERNEL = daxpby_lasx.S SSUMKERNEL = sum_lasx.S DSUMKERNEL = sum_lasx.S -SASUMKERNEL = sasum_lasx.S -DASUMKERNEL = dasum_lasx.S +SASUMKERNEL = asum_lasx.S +DASUMKERNEL = asum_lasx.S +CASUMKERNEL = casum_lasx.S +ZASUMKERNEL = casum_lasx.S SROTKERNEL = rot_lasx.S DROTKERNEL = rot_lasx.S diff --git a/kernel/loongarch64/asum_lasx.S b/kernel/loongarch64/asum_lasx.S new file mode 100644 index 000000000..9a2c031f3 --- /dev/null +++ b/kernel/loongarch64/asum_lasx.S @@ -0,0 +1,257 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 +#define VT0 $xr23 +#define VT1 $xr22 +#define res1 $xr16 +#define res2 $xr17 +#define res0 $xr18 +#define neg1 $xr19 + + PROLOGUE + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + xvxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 +#ifdef DOUBLE + li.d t1, -1 + xvreplgr2vr.d neg1, t1 + xvffint.d.l neg1, neg1 +#else + li.w t1, -1 + xvreplgr2vr.w neg1, t1 + xvffint.s.w neg1, neg1 +#endif + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfmul.d VX2, neg1, VX0 + xvfmul.d VX3, neg1, VX1 + xvfcmp.clt.d VT0, VX0, res0 + xvfcmp.clt.d VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 +#else + xvld VX0, X, 0 * SIZE + xvfmul.s VX2, neg1, VX0 + xvfcmp.clt.s VT0, VX0, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvfadd.s res1, VX0, res1 +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD $f12, X, 0 * SIZE + FABS $f12, $f12 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmul.d VX2, neg1, VX0 + xvfmul.d VX3, neg1, VX1 + xvfcmp.clt.d VT0, VX0, res0 + xvfcmp.clt.d VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.d res2, VX0, 
VX1 + xvfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvfmul.s VX2, neg1, VX0 + xvfcmp.clt.s VT0, VX0, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvfadd.s res1, VX0, res1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f12, X, 0 * SIZE + FABS $f12, $f12 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + MOV $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/asum_lsx.S b/kernel/loongarch64/asum_lsx.S new file mode 100644 index 000000000..512b01404 --- /dev/null +++ b/kernel/loongarch64/asum_lsx.S @@ -0,0 +1,258 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 +#define VT0 $vr23 +#define VT1 $vr22 +#define res1 $vr16 +#define res2 $vr17 +#define res0 $vr18 +#define neg1 $vr19 + + PROLOGUE + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + vxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 +#ifdef DOUBLE + li.d t1, -1 + vreplgr2vr.d neg1, t1 + vffint.d.l neg1, neg1 +#else + li.w t1, -1 + vreplgr2vr.w neg1, t1 + vffint.s.w neg1, neg1 +#endif + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 +#else + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vfmul.s VX2, neg1, VX0 + vfmul.s VX3, neg1, VX1 + vfcmp.clt.s VT0, VX0, res0 + vfcmp.clt.s VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.s res2, VX0, VX1 + vfadd.s res1, res1, res2 +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD $f12, X, 0 * SIZE + FABS $f12, $f12 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + 
vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfmul.s VX2, neg1, VX0 + vfmul.s VX3, neg1, VX1 + vfcmp.clt.s VT0, VX0, res0 + vfcmp.clt.s VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.s res2, VX0, VX1 + vfadd.s res1, res1, res2 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f12, X, 0 * SIZE + FABS $f12, $f12 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + MOV $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/casum_lasx.S b/kernel/loongarch64/casum_lasx.S new file mode 100644 index 000000000..caf0ff969 --- /dev/null +++ b/kernel/loongarch64/casum_lasx.S @@ -0,0 +1,329 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 +#define res1 $xr16 +#define res2 $xr17 +#define res3 $xr18 +#define res0 $xr19 +#define neg1 $xr20 +#define VT0 $xr21 +#define VT1 $xr22 + + PROLOGUE + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + xvxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 +#ifdef DOUBLE + li.d t1, -1 + xvreplgr2vr.d neg1, t1 + xvffint.d.l neg1, neg1 +#else + li.w t1, -1 + xvreplgr2vr.w neg1, t1 + xvffint.s.w neg1, neg1 +#endif + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfmul.d VX2, neg1, VX0 + xvfmul.d VX3, neg1, VX1 + xvfcmp.clt.d VT0, VX0, res0 + xvfcmp.clt.d VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 + xvld VX2, X, 8 * SIZE + xvld VX3, X, 12 * SIZE + xvfmul.d VX0, neg1, VX2 + xvfmul.d VX1, neg1, VX3 + xvfcmp.clt.d VT0, VX2, res0 + xvfcmp.clt.d VT1, VX3, res0 + xvbitsel.v VX2, VX2, VX0, VT0 + xvbitsel.v VX3, VX3, VX1, VT1 + xvfadd.d res2, VX2, VX3 + xvfadd.d res1, res1, res2 +#else + xvld VX0, X, 0 * SIZE + xvld VX1, X, 8 * SIZE + xvfmul.s VX2, neg1, VX0 + xvfmul.s VX3, neg1, VX1 + xvfcmp.clt.s VT0, VX0, res0 + xvfcmp.clt.s VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.s res2, VX0, VX1 + xvfadd.s res1, res2, res1 +#endif + addi.d X, X, 16 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS a1, a1 + FABS a2, a2 + addi.d I, I, -1 + ADD a1, a1, a2 + ADD s1, a1, s1 + addi.d X, X, 2 * SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmul.d VX2, neg1, VX0 + xvfmul.d VX3, neg1, VX1 + xvfcmp.clt.d VT0, VX0, res0 + xvfcmp.clt.d VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.d res2, VX0, 
VX1 + xvfadd.d res1, res1, res2 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmul.d VX2, neg1, VX0 + xvfmul.d VX3, neg1, VX1 + xvfcmp.clt.d VT0, VX0, res0 + xvfcmp.clt.d VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 0 + xvinsgr2vr.w VX1, t2, 1 + xvinsgr2vr.w VX1, t3, 2 + xvinsgr2vr.w VX1, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 4 + xvinsgr2vr.w VX1, t2, 5 + xvinsgr2vr.w VX1, t3, 6 + xvinsgr2vr.w VX1, t4, 7 + xvfmul.s VX2, neg1, VX0 + xvfmul.s VX3, neg1, VX1 + xvfcmp.clt.s VT0, VX0, res0 + xvfcmp.clt.s VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.s res2, VX0, VX1 + xvfadd.s res1, res2, res1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS a1, a1 + FABS a2, a2 + addi.d I, I, -1 + ADD a1, a1, a2 + ADD s1, a1, s1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + MOV $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/casum_lsx.S b/kernel/loongarch64/casum_lsx.S new file mode 100644 index 000000000..4822f2080 --- /dev/null +++ b/kernel/loongarch64/casum_lsx.S @@ -0,0 +1,358 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 +#define res1 $vr16 +#define res2 $vr17 +#define res3 $vr18 +#define res0 $vr19 +#define neg1 $vr20 +#define VT0 $vr21 +#define VT1 $vr22 + + PROLOGUE + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + vxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 +#ifdef DOUBLE + li.d t1, -1 + vreplgr2vr.d neg1, t1 + vffint.d.l neg1, neg1 +#else + li.w t1, -1 + vreplgr2vr.w neg1, t1 + vffint.s.w neg1, neg1 +#endif + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + vld VX2, X, 4 * SIZE + vld VX3, X, 6 * SIZE + vfmul.d VX0, neg1, VX2 + vfmul.d VX1, neg1, VX3 + vfcmp.clt.d VT0, VX2, res0 + vfcmp.clt.d VT1, VX3, res0 + vbitsel.v VX2, VX2, VX0, VT0 + vbitsel.v VX3, VX3, VX1, VT1 + vfadd.d res2, VX2, VX3 + vfadd.d res1, res1, res2 + vld VX0, X, 8 * SIZE + vld VX1, X, 10 * SIZE + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + vld VX2, X, 12 * SIZE + vld VX3, X, 14 * SIZE + vfmul.d VX0, neg1, VX2 + vfmul.d VX1, neg1, VX3 + vfcmp.clt.d VT0, VX2, res0 + vfcmp.clt.d VT1, VX3, res0 + vbitsel.v VX2, VX2, VX0, VT0 + vbitsel.v VX3, VX3, VX1, VT1 + vfadd.d res2, VX2, VX3 + vfadd.d res1, res1, res2 + addi.d I, I, -1 +#else + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vfmul.s VX2, neg1, VX0 + vfmul.s VX3, neg1, VX1 + vfcmp.clt.s VT0, VX0, res0 + vfcmp.clt.s VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.s res2, VX0, VX1 + vld VX0, 
X, 8 * SIZE + vld VX1, X, 12 * SIZE + addi.d I, I, -1 + vfmul.s VX2, neg1, VX0 + vfmul.s VX3, neg1, VX1 + vfcmp.clt.s VT0, VX0, res0 + vfcmp.clt.s VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.s res3, VX1, VX0 + vfadd.s res2, res3, res2 + vfadd.s res1, res1, res2 +#endif + addi.d X, X, 16 * SIZE + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS a1, a1 + FABS a2, a2 + addi.d I, I, -1 + ADD a1, a1, a2 + ADD s1, a1, s1 + addi.d X, X, 2 * SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfmul.s VX2, neg1, VX0 + vfmul.s VX3, neg1, VX1 + vfcmp.clt.s VT0, VX0, res0 + vfcmp.clt.s VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.s res2, VX0, VX1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w 
VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vfmul.s VX0, neg1, VX2 + vfmul.s VX1, neg1, VX3 + vfcmp.clt.s VT0, VX2, res0 + vfcmp.clt.s VT1, VX3, res0 + vbitsel.v VX2, VX2, VX0, VT0 + vbitsel.v VX3, VX3, VX1, VT1 + vfadd.s res3, VX2, VX3 + vfadd.s res2, res3, res2 + vfadd.s res1, res1, res2 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS a1, a1 + FABS a2, a2 + addi.d I, I, -1 + ADD a1, a1, a2 + ADD s1, a1, s1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + MOV $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/dasum_lasx.S b/kernel/loongarch64/dasum_lasx.S deleted file mode 100644 index 49de98c40..000000000 --- a/kernel/loongarch64/dasum_lasx.S +++ /dev/null @@ -1,148 +0,0 @@ -#define ASSEMBLER -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r17 -#define TEMP $r18 -#define t1 $r15 -#define t2 $r12 -#define t3 $r13 -#define t4 $r14 -#define VX0 $xr12 -#define VX1 $xr13 -#define VX2 $xr14 -#define VX3 $xr15 -#define VT0 $xr23 -#define VT1 $xr22 -#define res1 $xr16 -#define res2 $xr17 -#define res0 $xr18 -#define neg1 $xr19 - - PROLOGUE - xvxor.v res1, res1, res1 - xvxor.v res2, res2, res2 - xvxor.v res0, res0, res0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d t1, -1 - xvreplgr2vr.d neg1, t1 - xvffint.d.l neg1, neg1 - li.d TEMP, SIZE - slli.d INCX, INCX, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bge $r0, I, .L13 - .align 3 - -.L11: - xvld VX0, X, 0 * SIZE - xvld VX1, X, 4 * SIZE - xvfmul.d VX2, neg1, VX0 - xvfmul.d VX3, neg1, VX1 - xvfcmp.clt.d VT0, VX0, res0 - xvfcmp.clt.d VT1, VX1, res0 - xvbitsel.v VX0, VX0, VX2, VT0 - xvbitsel.v VX1, VX1, VX3, VT1 - xvfadd.d res2, VX0, VX1 - xvfadd.d res1, res1, res2 - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L11 - .align 3 - -.L12: - xvpickve.d VX1, res1, 1 - xvpickve.d VX2, res1, 2 - xvpickve.d VX3, res1, 3 - xvfadd.d res1, VX1, res1 - xvfadd.d res1, VX2, res1 - xvfadd.d res1, VX3, res1 - .align 3 - -.L13: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L14: - fld.d $f12, X, 0 * SIZE - fabs.d $f12, $f12 - fadd.d $f16, $f12, $f16 - addi.d I, I, -1 - addi.d X, X, SIZE - blt $r0, I, .L14 - b .L999 - .align 3 - -.L20: - bge $r0, I, .L23 - .align 3 - -.L21: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 - xvinsgr2vr.d VX0, t3, 2 - xvinsgr2vr.d VX0, t4, 3 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX1, t1, 0 - xvinsgr2vr.d VX1, t2, 1 - xvinsgr2vr.d VX1, t3, 2 - xvinsgr2vr.d VX1, t4, 3 - xvfmul.d VX2, neg1, VX0 - xvfmul.d VX3, neg1, VX1 - xvfcmp.clt.d VT0, VX0, res0 - xvfcmp.clt.d VT1, VX1, res0 - 
xvbitsel.v VX0, VX0, VX2, VT0 - xvbitsel.v VX1, VX1, VX3, VT1 - xvfadd.d res2, VX0, VX1 - xvfadd.d res1, res1, res2 - addi.d I, I, -1 - blt $r0, I, .L21 - .align 3 - -.L22: - xvpickve.d VX1, res1, 1 - xvpickve.d VX2, res1, 2 - xvpickve.d VX3, res1, 3 - xvfadd.d res1, VX1, res1 - xvfadd.d res1, VX2, res1 - xvfadd.d res1, VX3, res1 - .align 3 - -.L23: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L24: - fld.d $f12, X, 0 * SIZE - fabs.d $f12, $f12 - fadd.d $f16, $f12, $f16 - addi.d I, I, -1 - add.d X, X, INCX - blt $r0, I, .L24 - .align 3 - -.L999: - fmov.d $f0, $f16 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/dasum_lsx.S b/kernel/loongarch64/dasum_lsx.S deleted file mode 100644 index 94750815e..000000000 --- a/kernel/loongarch64/dasum_lsx.S +++ /dev/null @@ -1,158 +0,0 @@ -#define ASSEMBLER -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r17 -#define TEMP $r18 -#define t1 $r15 -#define t2 $r12 -#define t3 $r13 -#define t4 $r14 -#define VX0 $vr12 -#define VX1 $vr13 -#define VX2 $vr14 -#define VX3 $vr15 -#define VT0 $vr23 -#define VT1 $vr22 -#define res1 $vr16 -#define res2 $vr17 -#define res0 $vr18 -#define neg1 $vr19 - - PROLOGUE - vxor.v res1, res1, res1 - vxor.v res2, res2, res2 - vxor.v res0, res0, res0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d t1, -1 - vreplgr2vr.d neg1, t1 - vffint.d.l neg1, neg1 - li.d TEMP, SIZE - slli.d INCX, INCX, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bge $r0, I, .L13 - .align 3 - -.L11: - vld VX0, X, 0 * SIZE - vld VX1, X, 2 * SIZE - vfmul.d VX2, neg1, VX0 - vfmul.d VX3, neg1, VX1 - vfcmp.clt.d VT0, VX0, res0 - vfcmp.clt.d VT1, VX1, res0 - vbitsel.v VX0, VX0, VX2, VT0 - vbitsel.v VX1, VX1, VX3, VT1 - vfadd.d res2, VX0, VX1 - vfadd.d res1, res1, res2 - vld VX0, X, 4 * SIZE - vld VX1, X, 6 * SIZE - vfmul.d VX2, neg1, VX0 - vfmul.d VX3, neg1, VX1 - vfcmp.clt.d VT0, VX0, res0 - vfcmp.clt.d VT1, VX1, res0 - vbitsel.v VX0, VX0, VX2, VT0 - vbitsel.v VX1, VX1, VX3, VT1 - vfadd.d res2, VX0, VX1 - vfadd.d res1, res1, res2 - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L11 - .align 3 - -.L12: - vreplvei.d VX1, res1, 1 - vfadd.d res1, VX1, res1 - .align 3 - -.L13: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L14: - fld.d $f12, X, 0 * SIZE - fabs.d $f12, $f12 - fadd.d $f16, $f12, $f16 - addi.d I, I, -1 - addi.d X, X, SIZE - blt $r0, I, .L14 - b .L999 - .align 3 - -.L20: - bge $r0, I, .L23 - .align 3 - -.L21: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - add.d X, X, INCX - vfmul.d VX2, neg1, VX0 - vfmul.d VX3, neg1, VX1 - vfcmp.clt.d VT0, VX0, res0 - vfcmp.clt.d VT1, VX1, res0 - vbitsel.v VX0, VX0, VX2, VT0 - vbitsel.v VX1, VX1, VX3, VT1 - vfadd.d res2, VX0, VX1 - vfadd.d res1, res1, res2 - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t3, 0 - vinsgr2vr.d VX0, t4, 1 - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - add.d X, X, INCX - vfmul.d VX2, neg1, VX0 - vfmul.d VX3, neg1, VX1 - vfcmp.clt.d VT0, VX0, res0 - vfcmp.clt.d VT1, VX1, res0 - vbitsel.v VX0, VX0, VX2, VT0 - vbitsel.v VX1, VX1, VX3, VT1 - vfadd.d res2, VX0, VX1 - vfadd.d res1, res1, res2 - addi.d I, I, -1 - blt $r0, I, .L21 - .align 3 - -.L22: - vreplvei.d VX1, 
res1, 1 - vfadd.d res1, VX1, res1 - .align 3 - -.L23: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L24: - fld.d $f12, X, 0 * SIZE - fabs.d $f12, $f12 - fadd.d $f16, $f12, $f16 - addi.d I, I, -1 - add.d X, X, INCX - blt $r0, I, .L24 - .align 3 - -.L999: - fmov.d $f0, $f16 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/sasum_lasx.S b/kernel/loongarch64/sasum_lasx.S deleted file mode 100644 index a452701aa..000000000 --- a/kernel/loongarch64/sasum_lasx.S +++ /dev/null @@ -1,157 +0,0 @@ -#define ASSEMBLER -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r17 -#define TEMP $r18 -#define t1 $r15 -#define t2 $r12 -#define t3 $r13 -#define t4 $r14 -#define VX0 $xr12 -#define VX1 $xr13 -#define VX2 $xr14 -#define VX3 $xr15 -#define VT0 $xr23 -#define VT1 $xr22 -#define res1 $xr16 -#define res2 $xr17 -#define res0 $xr18 -#define neg1 $xr19 - - PROLOGUE - xvxor.v res1, res1, res1 - xvxor.v res2, res2, res2 - xvxor.v res0, res0, res0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.w t1, -1 - xvreplgr2vr.w neg1, t1 - xvffint.s.w neg1, neg1 - li.d TEMP, SIZE - slli.d INCX, INCX, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bge $r0, I, .L13 - .align 3 - -.L11: - xvld VX0, X, 0 * SIZE - xvfmul.s VX2, neg1, VX0 - xvfcmp.clt.s VT0, VX0, res0 - xvbitsel.v VX0, VX0, VX2, VT0 - xvfadd.s res1, VX0, res1 - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L11 - .align 3 - -.L12: - xvfadd.s res2, res1, res2 - xvpickve.w VX1, res1, 1 - xvpickve.w VX2, res1, 2 - xvpickve.w VX3, res1, 3 - xvfadd.s res1, VX1, res1 - xvfadd.s res1, VX2, res1 - xvfadd.s res1, VX3, res1 - xvpickve.w VX0, res2, 4 - xvpickve.w VX1, res2, 5 - xvpickve.w VX2, res2, 6 - xvpickve.w VX3, res2, 7 - xvfadd.s res1, VX0, res1 - xvfadd.s res1, VX1, res1 - xvfadd.s res1, VX2, res1 - xvfadd.s res1, VX2, res1 - .align 3 - -.L13: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L14: - fld.s $f12, X, 0 * SIZE - fabs.s $f12, $f12 - fadd.s $f16, $f12, $f16 - addi.d I, I, -1 - addi.d X, X, SIZE - blt $r0, I, .L14 - b .L999 - .align 3 - -.L20: - bge $r0, I, .L23 - .align 3 - -.L21: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - xvfmul.s VX2, neg1, VX0 - xvfcmp.clt.s VT0, VX0, res0 - xvbitsel.v VX0, VX0, VX2, VT0 - xvfadd.s res1, VX0, res1 - addi.d I, I, -1 - blt $r0, I, .L21 - .align 3 - -.L22: - xvfadd.s res2, res1, res2 - xvpickve.w VX1, res1, 1 - xvpickve.w VX2, res1, 2 - xvpickve.w VX3, res1, 3 - xvfadd.s res1, VX1, res1 - xvfadd.s res1, VX2, res1 - xvfadd.s res1, VX3, res1 - xvpickve.w VX0, res2, 4 - xvpickve.w VX1, res2, 5 - xvpickve.w VX2, res2, 6 - xvpickve.w VX3, res2, 7 - xvfadd.s res1, VX0, res1 - xvfadd.s res1, VX1, res1 - xvfadd.s res1, VX2, res1 - xvfadd.s res1, VX2, res1 - .align 3 - -.L23: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L24: - fld.s $f12, X, 0 * SIZE - fabs.s $f12, $f12 - fadd.s $f16, $f12, $f16 - addi.d I, I, -1 - add.d X, X, INCX - blt $r0, I, .L24 - .align 3 - -.L999: - fmov.s $f0, $f16 - jirl $r0, $r1, 0x0 - .align 3 - - 
EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/sasum_lsx.S b/kernel/loongarch64/sasum_lsx.S deleted file mode 100644 index 87026a144..000000000 --- a/kernel/loongarch64/sasum_lsx.S +++ /dev/null @@ -1,148 +0,0 @@ -#define ASSEMBLER -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r17 -#define TEMP $r18 -#define t1 $r15 -#define t2 $r12 -#define t3 $r13 -#define t4 $r14 -#define VX0 $vr12 -#define VX1 $vr13 -#define VX2 $vr14 -#define VX3 $vr15 -#define VT0 $vr23 -#define VT1 $vr22 -#define res1 $vr16 -#define res2 $vr17 -#define res0 $vr18 -#define neg1 $vr19 - - PROLOGUE - vxor.v res1, res1, res1 - vxor.v res2, res2, res2 - vxor.v res0, res0, res0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.w t1, -1 - vreplgr2vr.w neg1, t1 - vffint.s.w neg1, neg1 - li.d TEMP, SIZE - slli.d INCX, INCX, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bge $r0, I, .L13 - .align 3 - -.L11: - vld VX0, X, 0 * SIZE - vld VX1, X, 4 * SIZE - vfmul.s VX2, neg1, VX0 - vfmul.s VX3, neg1, VX1 - vfcmp.clt.s VT0, VX0, res0 - vfcmp.clt.s VT1, VX1, res0 - vbitsel.v VX0, VX0, VX2, VT0 - vbitsel.v VX1, VX1, VX3, VT1 - vfadd.s res2, VX0, VX1 - vfadd.s res1, res1, res2 - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L11 - .align 3 - -.L12: - vreplvei.w VX1, res1, 1 - vreplvei.w VX2, res1, 2 - vreplvei.w VX3, res1, 3 - vfadd.s res1, VX1, res1 - vfadd.s res1, VX2, res1 - vfadd.s res1, VX3, res1 - .align 3 - -.L13: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L14: - fld.s $f12, X, 0 * SIZE - fabs.s $f12, $f12 - fadd.s $f16, $f12, $f16 - addi.d I, I, -1 - addi.d X, X, SIZE - blt $r0, I, .L14 - b .L999 - .align 3 - -.L20: - bge $r0, I, .L23 - .align 3 - -.L21: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vfmul.s VX2, neg1, VX0 - vfmul.s VX3, neg1, VX1 - vfcmp.clt.s VT0, VX0, res0 - vfcmp.clt.s VT1, VX1, res0 - vbitsel.v VX0, VX0, VX2, VT0 - vbitsel.v VX1, VX1, VX3, VT1 - vfadd.s res2, VX0, VX1 - vfadd.s res1, res1, res2 - addi.d I, I, -1 - blt $r0, I, .L21 - .align 3 - -.L22: - vreplvei.w VX1, res1, 1 - vreplvei.w VX2, res1, 2 - vreplvei.w VX3, res1, 3 - vfadd.s res1, VX1, res1 - vfadd.s res1, VX2, res1 - vfadd.s res1, VX3, res1 - .align 3 - -.L23: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L24: - fld.s $f12, X, 0 * SIZE - fabs.s $f12, $f12 - fadd.s $f16, $f12, $f16 - addi.d I, I, -1 - add.d X, X, INCX - blt $r0, I, .L24 - .align 3 - -.L999: - fmov.s $f0, $f16 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file From 0753848e03e1298c162386df467f78bc15851cc4 Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Wed, 27 Dec 2023 16:54:01 +0800 Subject: [PATCH 11/21] loongarch64: Refine and add axpy optimization functions. 
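For reference, the kernels renamed and added here implement the BLAS AXPY operation, y := alpha*x + y, and its complex form (CAXPY/ZAXPY), which under the CONJ build applies alpha to the conjugate of x. A minimal scalar C sketch of the intended semantics follows; the helper names, argument order and double-precision types are illustrative assumptions rather than the OpenBLAS kernel interface, and the single-precision paths behave the same way with float.

    /* Sketch of axpy_lsx.S / axpy_lasx.S: y[i] += alpha * x[i] over strided
       vectors.  Hypothetical reference helper, not the kernel signature. */
    static void axpy_ref(long n, double alpha,
                         const double *x, long incx,
                         double *y, long incy)
    {
        if (n <= 0 || alpha == 0.0)   /* the kernels also return early here */
            return;
        for (long i = 0; i < n; i++)
            y[i * incy] += alpha * x[i * incx];
    }

    /* Sketch of caxpy_lsx.S / caxpy_lasx.S: elements are interleaved
       (re, im) pairs; conj != 0 mirrors the #ifdef CONJ path, i.e.
       y += alpha * conj(x). */
    static void caxpy_ref(long n, double ar, double ai,
                          const double *x, long incx,
                          double *y, long incy, int conj)
    {
        for (long i = 0; i < n; i++) {
            double xr = x[2 * i * incx];
            double xi = conj ? -x[2 * i * incx + 1] : x[2 * i * incx + 1];
            y[2 * i * incy]     += ar * xr - ai * xi;
            y[2 * i * incy + 1] += ar * xi + ai * xr;
        }
    }

The vector code reaches the same result by splitting x and y into real/imaginary lanes with xvpickev/xvpickod (vpickev/vpickod for LSX), forming the products with fmul/fmsub/fmadd against broadcast alpha_r and alpha_i, and, on the contiguous paths, interleaving back with xvilvl/xvilvh before the stores.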
Signed-off-by: Hao Chen --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 6 +- kernel/loongarch64/KERNEL.LOONGSON3R5 | 6 +- .../loongarch64/{daxpy_lasx.S => axpy_lasx.S} | 237 +++++- .../loongarch64/{daxpy_lsx.S => axpy_lsx.S} | 254 ++++++- kernel/loongarch64/caxpy_lasx.S | 707 ++++++++++++++++++ kernel/loongarch64/caxpy_lsx.S | 679 +++++++++++++++++ kernel/loongarch64/saxpy_lasx.S | 323 -------- kernel/loongarch64/saxpy_lsx.S | 338 --------- 8 files changed, 1839 insertions(+), 711 deletions(-) rename kernel/loongarch64/{daxpy_lasx.S => axpy_lasx.S} (52%) rename kernel/loongarch64/{daxpy_lsx.S => axpy_lsx.S} (53%) create mode 100644 kernel/loongarch64/caxpy_lasx.S create mode 100644 kernel/loongarch64/caxpy_lsx.S delete mode 100644 kernel/loongarch64/saxpy_lasx.S delete mode 100644 kernel/loongarch64/saxpy_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 201427dcd..bdde126ad 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -40,8 +40,10 @@ DCOPYKERNEL = copy_lsx.S SSWAPKERNEL = swap_lsx.S DSWAPKERNEL = swap_lsx.S -SAXPYKERNEL = saxpy_lsx.S -DAXPYKERNEL = daxpy_lsx.S +SAXPYKERNEL = axpy_lsx.S +DAXPYKERNEL = axpy_lsx.S +CAXPYKERNEL = caxpy_lsx.S +ZAXPYKERNEL = caxpy_lsx.S SAXPBYKERNEL = saxpby_lsx.S DAXPBYKERNEL = daxpby_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index e822cb630..7642b2a4d 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -40,8 +40,10 @@ DCOPYKERNEL = copy_lasx.S SSWAPKERNEL = swap_lasx.S DSWAPKERNEL = swap_lasx.S -SAXPYKERNEL = saxpy_lasx.S -DAXPYKERNEL = daxpy_lasx.S +SAXPYKERNEL = axpy_lasx.S +DAXPYKERNEL = axpy_lasx.S +CAXPYKERNEL = caxpy_lasx.S +ZAXPYKERNEL = caxpy_lasx.S SAXPBYKERNEL = saxpby_lasx.S DAXPBYKERNEL = daxpby_lasx.S diff --git a/kernel/loongarch64/daxpy_lasx.S b/kernel/loongarch64/axpy_lasx.S similarity index 52% rename from kernel/loongarch64/daxpy_lasx.S rename to kernel/loongarch64/axpy_lasx.S index bafd871ab..707fd09b5 100644 --- a/kernel/loongarch64/daxpy_lasx.S +++ b/kernel/loongarch64/axpy_lasx.S @@ -1,6 +1,33 @@ -#define ASSEMBLER +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#define ASSEMBLER #include "common.h" + #define N $r4 #define XX $r5 #define YY $r6 @@ -35,16 +62,20 @@ bge $r0, N, .L999 li.d TEMP, 1 movgr2fr.d a1, $r0 - ffint.d.l a1, a1 + FFINT a1, a1 movgr2fr.d a2, TEMP - ffint.d.l a2, a2 - fcmp.ceq.d $fcc0, ALPHA, a1 + FFINT a2, a2 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L999 slli.d TEMP, TEMP, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT slli.d INCY, INCY, BASE_SHIFT - movfr2gr.d t1, ALPHA + MTG t1, ALPHA +#ifdef DOUBLE xvreplgr2vr.d VXA, t1 +#else + xvreplgr2vr.w VXA, t1 +#endif srai.d I, N, 3 bne INCX, TEMP, .L20 @@ -56,11 +87,12 @@ .L11: bge $r0, I, .L113 - fcmp.ceq.d $fcc0, ALPHA, a2 + CMPEQ $fcc0, ALPHA, a2 bceqz $fcc0, .L112 .align 3 .L111: +#ifdef DOUBLE xvld VX0, X, 0 * SIZE xvld VX2, Y, 0 * SIZE xvld VX1, X, 4 * SIZE @@ -70,6 +102,13 @@ addi.d I, I, -1 xvst VX2, Y, 0 * SIZE xvst VX3, Y, 4 * SIZE +#else + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + addi.d I, I, -1 + xvfadd.s VX2, VX0, VX2 + xvst VX2, Y, 0 * SIZE +#endif addi.d X, X, 8 * SIZE addi.d Y, Y, 8 * SIZE blt $r0, I, .L111 @@ -77,6 +116,7 @@ .align 3 .L112: +#ifdef DOUBLE xvld VX0, X, 0 * SIZE xvld VX2, Y, 0 * SIZE xvld VX1, X, 4 * SIZE @@ -86,6 +126,13 @@ addi.d I, I, -1 xvst VX2, Y, 0 * SIZE xvst VX3, Y, 4 * SIZE +#else + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + addi.d I, I, -1 + xvfmadd.s VX2, VX0, VXA, VX2 + xvst VX2, Y, 0 * SIZE +#endif addi.d X, X, 8 * SIZE addi.d Y, Y, 8 * SIZE blt $r0, I, .L112 @@ -97,11 +144,11 @@ .align 3 .L114: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE addi.d I, I, -1 - fmadd.d $f14, $f12, $f0, $f14 - fst.d $f14, Y, 0 * SIZE + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE addi.d X, X, SIZE addi.d Y, Y, SIZE blt $r0, I, .L114 @@ -114,6 +161,7 @@ .align 3 .L121: +#ifdef DOUBLE xvld VX0, X, 0 * SIZE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY @@ -158,6 +206,50 @@ xvstelm.d VX3, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VX3, YY, 0, 3 +#else + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmadd.s VX2, VX0, VXA, VX2 + addi.d I, I, -1 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif add.d YY, YY, 
INCY addi.d X, X, 8 * SIZE blt $r0, I, .L121 @@ -169,11 +261,11 @@ .align 3 .L123: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE addi.d I, I, -1 - fmadd.d $f14, $f12, $f0, $f14 - fst.d $f14, Y, 0 * SIZE + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE addi.d X, X, SIZE add.d Y, Y, INCY blt $r0, I, .L123 @@ -185,6 +277,7 @@ .align 3 .L211: +#ifdef DOUBLE xvld VX2, Y, 0 * SIZE ld.d t1, X, 0 * SIZE add.d X, X, INCX @@ -217,6 +310,37 @@ addi.d I, I, -1 xvst VX3, Y, 4 * SIZE addi.d Y, Y, 8 * SIZE +#else + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvfmadd.s VX2, VX0, VXA, VX2 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + addi.d Y, Y, 8 * SIZE +#endif blt $r0, I, .L211 .align 3 @@ -226,11 +350,11 @@ .align 3 .L213: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE addi.d I, I, -1 - fmadd.d $f14, $f12, $f0, $f14 - fst.d $f14, Y, 0 * SIZE + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE add.d X, X, INCX addi.d Y, Y, SIZE blt $r0, I, .L213 @@ -243,6 +367,7 @@ .align 3 .L222: +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -309,6 +434,73 @@ xvstelm.d VX3, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VX3, YY, 0, 3 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmadd.s VX2, VX0, VXA, VX2 + addi.d I, I, -1 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif add.d YY, YY, INCY blt $r0, I, .L222 .align 3 @@ -319,15 +511,14 @@ .align 3 .L224: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE addi.d I, I, -1 - fmadd.d $f14, $f12, $f0, $f14 - fst.d $f14, Y, 0 * SIZE + MADD $f14, $f12, 
$f0, $f14 + ST $f14, Y, 0 * SIZE add.d X, X, INCX add.d Y, Y, INCY blt $r0, I, .L224 - b .L999 .align 3 .L999: diff --git a/kernel/loongarch64/daxpy_lsx.S b/kernel/loongarch64/axpy_lsx.S similarity index 53% rename from kernel/loongarch64/daxpy_lsx.S rename to kernel/loongarch64/axpy_lsx.S index fc88f0bb9..0d74e2bce 100644 --- a/kernel/loongarch64/daxpy_lsx.S +++ b/kernel/loongarch64/axpy_lsx.S @@ -1,6 +1,33 @@ -#define ASSEMBLER +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#define ASSEMBLER #include "common.h" + #define N $r4 #define XX $r5 #define YY $r6 @@ -35,16 +62,20 @@ bge $r0, N, .L999 li.d TEMP, 1 movgr2fr.d a1, $r0 - ffint.d.l a1, a1 + FFINT a1, a1 movgr2fr.d a2, TEMP - ffint.d.l a2, a2 - fcmp.ceq.d $fcc0, ALPHA, a1 + FFINT a2, a2 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L999 slli.d TEMP, TEMP, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT slli.d INCY, INCY, BASE_SHIFT - movfr2gr.d t1, ALPHA + MTG t1, ALPHA +#ifdef DOUBLE vreplgr2vr.d VXA, t1 +#else + vreplgr2vr.w VXA, t1 +#endif srai.d I, N, 3 bne INCX, TEMP, .L20 @@ -56,11 +87,12 @@ .L11: bge $r0, I, .L113 - fcmp.ceq.d $fcc0, ALPHA, a2 + CMPEQ $fcc0, ALPHA, a2 bceqz $fcc0, .L112 .align 3 .L111: +#ifdef DOUBLE vld VX0, X, 0 * SIZE vld VX2, Y, 0 * SIZE vld VX1, X, 2 * SIZE @@ -75,16 +107,27 @@ vld VX3, Y, 6 * SIZE vfadd.d VX2, VX0, VX2 vfadd.d VX3, VX1, VX3 - addi.d I, I, -1 vst VX2, Y, 4 * SIZE vst VX3, Y, 6 * SIZE +#else + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vfadd.s VX2, VX0, VX2 + vfadd.s VX3, VX1, VX3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE +#endif addi.d X, X, 8 * SIZE addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 blt $r0, I, .L111 b .L113 .align 3 .L112: +#ifdef DOUBLE vld VX0, X, 0 * SIZE vld VX2, Y, 0 * SIZE vld VX1, X, 2 * SIZE @@ -104,6 +147,19 @@ vst VX2, Y, 4 * SIZE vst VX3, Y, 6 * SIZE addi.d Y, Y, 8 * SIZE +#else + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vfmadd.s VX2, VX0, VXA, VX2 + vfmadd.s VX3, VX1, VXA, VX3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 +#endif blt $r0, I, .L112 .align 3 @@ -113,11 +169,11 @@ .align 3 .L114: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE addi.d I, I, -1 - fmadd.d $f14, $f12, $f0, $f14 - fst.d $f14, Y, 0 * SIZE + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE addi.d X, X, SIZE addi.d Y, Y, SIZE blt $r0, I, .L114 @@ -130,6 +186,7 @@ .align 3 .L121: +#ifdef DOUBLE vld VX0, X, 0 * SIZE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY @@ -180,6 +237,54 @@ add.d YY, YY, INCY addi.d X, X, 8 * SIZE addi.d I, I, -1 +#else + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmadd.s VX2, VX0, VXA, VX2 + vld VX1, X, 4 * SIZE + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vfmadd.s VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE +#endif blt $r0, I, .L121 .align 3 @@ -189,11 +294,11 @@ .align 3 .L123: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE addi.d I, I, -1 - fmadd.d $f14, $f12, $f0, $f14 - 
fst.d $f14, Y, 0 * SIZE + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE addi.d X, X, SIZE add.d Y, Y, INCY blt $r0, I, .L123 @@ -205,6 +310,7 @@ .align 3 .L211: +#ifdef DOUBLE vld VX2, Y, 0 * SIZE ld.d t1, X, 0 * SIZE add.d X, X, INCX @@ -242,6 +348,39 @@ vfmadd.d VX3, VX1, VXA, VX3 addi.d I, I, -1 vst VX3, Y, 6 * SIZE +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmadd.s VX2, VX0, VXA, VX2 + vld VX3, Y, 4 * SIZE + vst VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + vfmadd.s VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vst VX3, Y, 4 * SIZE +#endif addi.d Y, Y, 8 * SIZE blt $r0, I, .L211 .align 3 @@ -252,11 +391,11 @@ .align 3 .L213: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE addi.d I, I, -1 - fmadd.d $f14, $f12, $f0, $f14 - fst.d $f14, Y, 0 * SIZE + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE add.d X, X, INCX addi.d Y, Y, SIZE blt $r0, I, .L213 @@ -269,6 +408,7 @@ .align 3 .L222: +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -337,6 +477,74 @@ vstelm.d VX3, YY, 0, 0 add.d YY, YY, INCY vstelm.d VX3, YY, 0, 1 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmadd.s VX2, VX0, VXA, VX2 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vfmadd.s VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 +#endif add.d YY, YY, INCY blt $r0, I, .L222 .align 3 @@ -347,11 +555,11 @@ .align 3 .L224: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE addi.d I, I, -1 - fmadd.d $f14, $f12, $f0, $f14 - fst.d $f14, Y, 0 * SIZE + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE add.d X, X, INCX add.d Y, Y, INCY blt $r0, I, .L224 diff --git a/kernel/loongarch64/caxpy_lasx.S 
b/kernel/loongarch64/caxpy_lasx.S new file mode 100644 index 000000000..2b970fe70 --- /dev/null +++ b/kernel/loongarch64/caxpy_lasx.S @@ -0,0 +1,707 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define XX $r5 +#define YY $r6 +#define ALPHAR $f0 +#define ALPHAI $f1 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VXAR $xr23 +#define VXAI $xr19 +#define x1 $xr18 +#define x2 $xr17 +#define x3 $xr16 +#define x4 $xr15 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + CMPEQ $fcc0, ALPHAR, a1 + CMPEQ $fcc1, ALPHAI, a1 + bceqz $fcc0, .L10 + bcnez $fcc1, .L999 +.L10: + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + MTG t1, ALPHAR + MTG t2, ALPHAI +#ifdef DOUBLE + xvreplgr2vr.d VXAR, t1 + xvreplgr2vr.d VXAI, t2 + srai.d I, N, 2 +#else + xvreplgr2vr.w VXAR, t1 + xvreplgr2vr.w VXAI, t2 + srai.d I, N, 3 +#endif + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + .align 3 + +.L111: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 +#else + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX1, X, 8 * SIZE + xvld VX3, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvpickev.w x3, 
VX3, VX2 + xvpickod.w x4, VX3, VX2 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmsub.d VX1, VXAR, x1, VX0 + xvfmadd.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfadd.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmsub.s VX1, VXAR, x1, VX0 + xvfmadd.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmadd.d VX1, VXAR, x1, VX0 + xvfmsub.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfsub.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmadd.s VX1, VXAR, x1, VX0 + xvfmsub.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + .align 3 + +.L121: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 2 + xvinsgr2vr.d x4, t4, 2 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.d x3, t1, 1 + xvinsgr2vr.d x4, t2, 1 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + add.d Y, Y, INCY + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 +#else + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + xvld VX1, X, 8 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + add.d Y, Y, INCY + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmsub.d VX1, VXAR, x1, VX0 + xvfmadd.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfadd.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmsub.s VX1, VXAR, x1, VX0 + xvfmadd.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmadd.d VX1, VXAR, x1, VX0 + xvfmsub.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfsub.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmadd.s VX1, VXAR, x1, VX0 + xvfmsub.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfsub.s x4, x4, VX3 
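+ // x1/x2 hold the real/imaginary lanes of x and x3/x4 the real/imaginary
+ // lanes of y; each branch above accumulates alpha*x (alpha*conj(x) when
+ // CONJ is defined) into x3/x4 before the strided element stores below.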
+#endif +#endif +#ifdef DOUBLE + xvstelm.d x3, YY, 0 * SIZE, 0 + xvstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 2 + xvstelm.d x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 1 + xvstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 3 + xvstelm.d x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 +#else + addi.d I, I, -1 + xvstelm.w x3, YY, 0 * SIZE, 0 + xvstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 1 + xvstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 4 + xvstelm.w x4, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 5 + xvstelm.w x4, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 2 + xvstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 3 + xvstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 6 + xvstelm.w x4, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 7 + xvstelm.w x4, YY, 1 * SIZE, 7 + add.d YY, YY, INCY + addi.d X, X, 16 * SIZE +#endif + blt $r0, I, .L121 + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + .align 3 + +.L211: +#ifdef DOUBLE + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 2 + xvinsgr2vr.d x2, t4, 2 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 1 + xvinsgr2vr.d x2, t2, 1 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 +#else + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + xvld VX3, Y, 8 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmsub.d VX1, VXAR, x1, VX0 + xvfmadd.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfadd.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmsub.s VX1, VXAR, x1, VX0 + xvfmadd.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmadd.d VX1, VXAR, x1, VX0 + xvfmsub.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfsub.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmadd.s VX1, VXAR, x1, VX0 + xvfmsub.s VX3, x2, VXAR, VX2 + xvfadd.s x3, 
x3, VX1 + xvfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 8 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + blt $r0, I, .L211 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + .align 3 + +.L222: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 1 + xvinsgr2vr.d x4, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 2 + xvinsgr2vr.d x4, t2, 2 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmsub.d VX1, VXAR, x1, VX0 + xvfmadd.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfadd.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmsub.s VX1, VXAR, x1, VX0 + xvfmadd.s VX3, x2, VXAR, 
VX2 + xvfadd.s x3, x3, VX1 + xvfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmadd.d VX1, VXAR, x1, VX0 + xvfmsub.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfsub.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmadd.s VX1, VXAR, x1, VX0 + xvfmsub.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfsub.s x4, x4, VX3 +#endif +#endif + addi.d I, I, -1 +#ifdef DOUBLE + xvstelm.d x3, YY, 0 * SIZE, 0 + xvstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 1 + xvstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 2 + xvstelm.d x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 3 + xvstelm.d x4, YY, 1 * SIZE, 3 +#else + xvstelm.w x3, YY, 0 * SIZE, 0 + xvstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 1 + xvstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 2 + xvstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 3 + xvstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 4 + xvstelm.w x4, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 5 + xvstelm.w x4, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 6 + xvstelm.w x4, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 7 + xvstelm.w x4, YY, 1 * SIZE, 7 +#endif + add.d YY, YY, INCY + blt $r0, I, .L222 + .align 3 + +.L997: +#ifdef DOUBLE + andi I, N, 3 +#else + andi I, N, 7 +#endif + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + addi.d I, I, -1 +#if !defined(CONJ) + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MSUB s3, ALPHAR, a1, s1 + MADD s4, a2, ALPHAR, s2 + ADD s3, s3, a3 + ADD s4, s4, a4 +#else + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MADD s3, ALPHAR, a1, s1 + MSUB s4, a2, ALPHAR, s2 + ADD s3, s3, a3 + SUB s4, a4, s4 +#endif + ST s3, Y, 0 * SIZE + ST s4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/caxpy_lsx.S b/kernel/loongarch64/caxpy_lsx.S new file mode 100644 index 000000000..85598d0b9 --- /dev/null +++ b/kernel/loongarch64/caxpy_lsx.S @@ -0,0 +1,679 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define XX $r5 +#define YY $r6 +#define ALPHAR $f0 +#define ALPHAI $f1 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VXAR $vr23 +#define VXAI $vr19 +#define x1 $vr18 +#define x2 $vr17 +#define x3 $vr16 +#define x4 $vr15 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + CMPEQ $fcc0, ALPHAR, a1 + CMPEQ $fcc1, ALPHAI, a1 + bceqz $fcc0, .L10 + bcnez $fcc1, .L999 +.L10: + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + MTG t1, ALPHAR + MTG t2, ALPHAI +#ifdef DOUBLE + vreplgr2vr.d VXAR, t1 + vreplgr2vr.d VXAI, t2 +#else + vreplgr2vr.w VXAR, t1 + vreplgr2vr.w VXAI, t2 +#endif + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + .align 3 + +.L111: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 +#else + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmsub.s VX1, VXAR, x1, VX0 + vfmadd.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmadd.s VX1, VXAR, x1, VX0 + vfmsub.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 +#if !defined(CONJ) + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.d VX0, VXAI, x2 + vfmul.d 
VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#endif + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE +#else + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + .align 3 + +.L121: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 +#else + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmsub.s VX1, VXAR, x1, VX0 + vfmadd.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmadd.s VX1, VXAR, x1, VX0 + vfmsub.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 +#if !defined(CONJ) + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#endif + addi.d I, I, -1 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 +#else + addi.d I, I, -1 + vstelm.w x3, YY, 0 * SIZE, 0 + vstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 1 + vstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 2 + vstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 3 + vstelm.w x4, YY, 1 * SIZE, 3 +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + b .L997 + .align 3 
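
[Editorial note] The unit-stride loop (.L111) and the INCX==1/INCY!=1 loop (.L121) above evaluate the same complex update: each vector register is split into real and imaginary lanes with vpickev/vpickod, the update is computed, and the lanes are recombined with vilvl/vilvh or stored element-wise with vstelm. A minimal scalar sketch of that arithmetic follows; the function name caxpy_ref and the float-only signature are illustrative assumptions, not symbols from the patch, and the DOUBLE path differs only in element type.

/* Scalar sketch of the complex update the vector loops above perform
 * four (DOUBLE) or eight (single) elements at a time.
 * caxpy_ref / alpha_r / alpha_i are illustrative names only. */
#include <stddef.h>

static void caxpy_ref(size_t n, float alpha_r, float alpha_i,
                      const float *x, ptrdiff_t incx,
                      float *y, ptrdiff_t incy, int conj)
{
    for (size_t i = 0; i < n; i++) {
        float xr = x[0], xi = x[1];
        if (!conj) {                           /* the !defined(CONJ) branch */
            y[0] += alpha_r * xr - alpha_i * xi;
            y[1] += alpha_r * xi + alpha_i * xr;
        } else {                               /* the CONJ branch */
            y[0] += alpha_r * xr + alpha_i * xi;
            y[1] += alpha_i * xr - alpha_r * xi;
        }
        x += 2 * incx;                         /* increments are in complex elements, */
        y += 2 * incy;                         /* matching the ZBASE_SHIFT scaling     */
    }
}

The !defined(CONJ) branch is the plain y += alpha*x update; the CONJ branch, as encoded by the vfmadd/vfmsub swap and the final vfsub, reads as y += alpha * conj(x).
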
+ +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + .align 3 + +.L211: +#ifdef DOUBLE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmsub.s VX1, VXAR, x1, VX0 + vfmadd.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmadd.s VX1, VXAR, x1, VX0 + vfmsub.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + + vld VX2, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 +#if !defined(CONJ) + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#endif + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + addi.d I, I, -1 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE +#else + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + addi.d I, I, -1 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + .align 3 + +.L222: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w 
t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 +#endif + add.d Y, Y, INCY +#if !defined(CONJ) +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmsub.s VX1, VXAR, x1, VX0 + vfmadd.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmadd.s VX1, VXAR, x1, VX0 + vfmsub.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY +#if !defined(CONJ) + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#endif + addi.d I, I, -1 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 +#else + addi.d I, I, -1 + vstelm.w x3, YY, 0 * SIZE, 0 + vstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 1 + vstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 2 + vstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 3 + vstelm.w x4, YY, 1 * SIZE, 3 +#endif + add.d YY, YY, INCY + blt $r0, I, .L222 + .align 3 + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + addi.d I, I, -1 +#if !defined(CONJ) + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MSUB s3, ALPHAR, a1, s1 + MADD s4, a2, ALPHAR, s2 + ADD s3, s3, a3 + ADD s4, s4, a4 +#else + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MADD s3, ALPHAR, a1, s1 + MSUB s4, a2, ALPHAR, s2 + ADD s3, s3, a3 + SUB s4, a4, s4 +#endif + ST s3, Y, 0 * SIZE + ST s4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/saxpy_lasx.S 
b/kernel/loongarch64/saxpy_lasx.S deleted file mode 100644 index 609e26328..000000000 --- a/kernel/loongarch64/saxpy_lasx.S +++ /dev/null @@ -1,323 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define XX $r5 -#define YY $r6 -#define ALPHA $f0 -#define X $r7 -#define INCX $r8 -#define Y $r9 -#define INCY $r10 - -#define I $r12 -#define TEMP $r13 -#define t1 $r14 -#define t2 $r16 -#define t3 $r15 -#define t4 $r17 -#define a1 $f12 -#define a2 $f13 -#define a3 $f14 -#define a4 $f15 -#define b1 $f16 -#define b2 $f17 -#define b3 $f18 -#define b4 $f19 -#define VX0 $xr8 -#define VX1 $xr20 -#define VX2 $xr21 -#define VX3 $xr22 -#define VXA $xr23 - - PROLOGUE - - bge $r0, N, .L999 - li.d TEMP, 1 - movgr2fr.d a1, $r0 - ffint.s.l a1, a1 - movgr2fr.d a2, TEMP - ffint.s.l a2, a2 - fcmp.ceq.s $fcc0, ALPHA, a1 - bcnez $fcc0, .L999 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - movfr2gr.s t1, ALPHA - xvreplgr2vr.w VXA, t1 - - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L113 - fcmp.ceq.s $fcc0, ALPHA, a2 - bceqz $fcc0, .L112 - .align 3 - -.L111: - xvld VX0, X, 0 * SIZE - xvld VX2, Y, 0 * SIZE - addi.d I, I, -1 - xvfadd.s VX2, VX0, VX2 - xvst VX2, Y, 0 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L111 - b .L113 - .align 3 - -.L112: - xvld VX0, X, 0 * SIZE - xvld VX2, Y, 0 * SIZE - addi.d I, I, -1 - xvfmadd.s VX2, VX0, VXA, VX2 - xvst VX2, Y, 0 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L112 - .align 3 - -.L113: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L114: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fmadd.s $f14, $f12, $f0, $f14 - fst.s $f14, Y, 0 * SIZE - addi.d X, X, SIZE - addi.d Y, Y, SIZE - blt $r0, I, .L114 - b .L999 - .align 3 - -.L12: // INCX==1 and INCY!=1 - bge $r0, I, .L122 - move YY, Y - .align 3 - -.L121: - xvld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvfmadd.s VX2, VX0, VXA, VX2 - addi.d I, I, -1 - xvstelm.w VX2, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 7 - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - blt $r0, I, .L121 - .align 3 - -.L122: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L123: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fmadd.s $f14, $f12, $f0, $f14 - fst.s $f14, Y, 0 * SIZE - addi.d X, X, SIZE - add.d Y, Y, INCY - blt $r0, I, .L123 - b .L999 - .align 3 - -.L21:// INCX!=1 and INCY==1 - bge $r0, I, .L212 - .align 3 - -.L211: - xvld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE 
- add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - add.d X, X, INCX - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - xvfmadd.s VX2, VX0, VXA, VX2 - addi.d I, I, -1 - xvst VX2, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L211 - .align 3 - -.L212: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L213: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fmadd.s $f14, $f12, $f0, $f14 - fst.s $f14, Y, 0 * SIZE - add.d X, X, INCX - addi.d Y, Y, SIZE - blt $r0, I, .L213 - b .L999 - .align 3 - -.L22: - bge $r0, I, .L223 - move YY, Y - .align 3 - -.L222: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvfmadd.s VX2, VX0, VXA, VX2 - addi.d I, I, -1 - xvstelm.w VX2, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 7 - add.d YY, YY, INCY - blt $r0, I, .L222 - .align 3 - -.L223: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L224: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fmadd.s $f14, $f12, $f0, $f14 - fst.s $f14, Y, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L224 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/saxpy_lsx.S b/kernel/loongarch64/saxpy_lsx.S deleted file mode 100644 index f47415ed6..000000000 --- a/kernel/loongarch64/saxpy_lsx.S +++ /dev/null @@ -1,338 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define XX $r5 -#define YY $r6 -#define ALPHA $f0 -#define X $r7 -#define INCX $r8 -#define Y $r9 -#define INCY $r10 - -#define I $r12 -#define TEMP $r13 -#define t1 $r14 -#define t2 $r16 -#define t3 $r15 -#define t4 $r17 -#define a1 $f12 -#define a2 $f13 -#define a3 $f14 -#define a4 $f15 -#define b1 $f16 -#define b2 $f17 -#define b3 $f18 -#define b4 $f19 -#define VX0 $vr8 -#define VX1 $vr20 -#define VX2 $vr21 -#define VX3 $vr22 -#define VXA 
$vr23 - - PROLOGUE - - bge $r0, N, .L999 - li.d TEMP, 1 - movgr2fr.d a1, $r0 - ffint.s.l a1, a1 - movgr2fr.d a2, TEMP - ffint.s.l a2, a2 - fcmp.ceq.s $fcc0, ALPHA, a1 - bcnez $fcc0, .L999 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - movfr2gr.s t1, ALPHA - vreplgr2vr.w VXA, t1 - - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L113 - fcmp.ceq.s $fcc0, ALPHA, a2 - bceqz $fcc0, .L112 - .align 3 - -.L111: - vld VX0, X, 0 * SIZE - vld VX2, Y, 0 * SIZE - vld VX1, X, 4 * SIZE - vld VX3, Y, 4 * SIZE - vfadd.s VX2, VX0, VX2 - vfadd.s VX3, VX1, VX3 - vst VX2, Y, 0 * SIZE - vst VX3, Y, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L111 - b .L113 - .align 3 - -.L112: - vld VX0, X, 0 * SIZE - vld VX2, Y, 0 * SIZE - vld VX1, X, 4 * SIZE - vld VX3, Y, 4 * SIZE - vfmadd.s VX2, VX0, VXA, VX2 - vfmadd.s VX3, VX1, VXA, VX3 - vst VX2, Y, 0 * SIZE - vst VX3, Y, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L112 - b .L113 - .align 3 - -.L113: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L114: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fmadd.s $f14, $f12, $f0, $f14 - fst.s $f14, Y, 0 * SIZE - addi.d X, X, SIZE - addi.d Y, Y, SIZE - blt $r0, I, .L114 - b .L999 - .align 3 - -.L12: // INCX==1 and INCY!=1 - bge $r0, I, .L122 - move YY, Y - .align 3 - -.L121: - vld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - vfmadd.s VX2, VX0, VXA, VX2 - vld VX1, X, 4 * SIZE - vstelm.w VX2, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 3 - add.d YY, YY, INCY - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - add.d Y, Y, INCY - vfmadd.s VX3, VX1, VXA, VX3 - addi.d I, I, -1 - vstelm.w VX3, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 3 - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - blt $r0, I, .L121 - .align 3 - -.L122: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L123: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fmadd.s $f14, $f12, $f0, $f14 - fst.s $f14, Y, 0 * SIZE - addi.d X, X, SIZE - add.d Y, Y, INCY - blt $r0, I, .L123 - b .L999 - .align 3 - -.L21:// INCX!=1 and INCY==1 - bge $r0, I, .L212 - .align 3 - -.L211: - vld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - add.d X, X, INCX - vfmadd.s VX2, VX0, VXA, VX2 - vld VX3, Y, 4 * SIZE - vst VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - 
vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - add.d X, X, INCX - vfmadd.s VX3, VX1, VXA, VX3 - addi.d I, I, -1 - vst VX3, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L211 - .align 3 - -.L212: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L213: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fmadd.s $f14, $f12, $f0, $f14 - fst.s $f14, Y, 0 * SIZE - add.d X, X, INCX - addi.d Y, Y, SIZE - blt $r0, I, .L213 - b .L999 - .align 3 - -.L22: - bge $r0, I, .L223 - move YY, Y - .align 3 - -.L222: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - add.d X, X, INCX - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - vfmadd.s VX2, VX0, VXA, VX2 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vstelm.w VX2, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 3 - add.d YY, YY, INCY - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - add.d Y, Y, INCY - vfmadd.s VX3, VX1, VXA, VX3 - addi.d I, I, -1 - vstelm.w VX3, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 3 - add.d YY, YY, INCY - blt $r0, I, .L222 - .align 3 - -.L223: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L224: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fmadd.s $f14, $f12, $f0, $f14 - fst.s $f14, Y, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L224 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE From 8785e948b534a7747611b36b46a69f5813a18cc2 Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Wed, 27 Dec 2023 17:04:46 +0800 Subject: [PATCH 12/21] loongarch64: Add camin optimization function. 
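
[Editorial note] The CAMIN kernels added by this patch reduce the vector to the smallest BLAS-style complex magnitude, |Re(x_i)| + |Im(x_i)|. A scalar reference of that reduction is sketched below; camin_ref and the float-only signature are illustrative assumptions, not part of the patch. The LSX/LASX code processes eight complex elements per iteration, finishes with a horizontal vfmin/xvfmin reduction, and handles the N%8 remainder with the scalar tail loop (.L24).

/* Scalar reference for the CAMIN reduction implemented below.
 * camin_ref is an illustrative name only. */
#include <math.h>
#include <stddef.h>

static float camin_ref(size_t n, const float *x, ptrdiff_t incx)
{
    if (n == 0 || incx <= 0) return 0.0f;      /* mirrors the early exits to .L999 */
    float m = fabsf(x[0]) + fabsf(x[1]);       /* seed with the first element      */
    for (size_t i = 1; i < n; i++) {
        x += 2 * incx;
        float a = fabsf(x[0]) + fabsf(x[1]);
        if (a < m) m = a;
    }
    return m;
}
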
--- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 1 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 1 + kernel/loongarch64/camin_lasx.S | 199 +++++++++++++++++++++ kernel/loongarch64/camin_lsx.S | 211 +++++++++++++++++++++++ 4 files changed, 412 insertions(+) create mode 100644 kernel/loongarch64/camin_lasx.S create mode 100644 kernel/loongarch64/camin_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index bdde126ad..fc1766ff5 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -13,6 +13,7 @@ CAMAXKERNEL = camax_lsx.S SAMINKERNEL = amin_lsx.S DAMINKERNEL = amin_lsx.S +CAMINKERNEL = camin_lsx.S SMAXKERNEL = max_lsx.S DMAXKERNEL = max_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 7642b2a4d..7de9d4440 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -13,6 +13,7 @@ CAMAXKERNEL = camax_lasx.S SAMINKERNEL = amin_lasx.S DAMINKERNEL = amin_lasx.S +CAMINKERNEL = camin_lasx.S SMAXKERNEL = max_lsx.S DMAXKERNEL = max_lsx.S diff --git a/kernel/loongarch64/camin_lasx.S b/kernel/loongarch64/camin_lasx.S new file mode 100644 index 000000000..d7931d30a --- /dev/null +++ b/kernel/loongarch64/camin_lasx.S @@ -0,0 +1,199 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define TEMP $r16 +#define t1 $f14 +#define t2 $f18 +#define t3 $f15 +#define t4 $f17 +#define s1 $f22 +#define s2 $f9 +#define s3 $f10 +#define s4 $f11 +#define a0 $f20 +#define a1 $f21 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VT0 $xr13 +#define VT1 $xr14 +#define res0 $xr18 +#define neg1 $xr19 +#define VX0 $xr20 +#define VX1 $xr21 +#define VM0 $xr22 +#define VM1 $xr23 + + PROLOGUE + MTC s1, $r0 + xvxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + fld.s a0, X, 0 * SIZE + fld.s a1, X, 1 * SIZE + fabs.s a0, a0 + fabs.s a1, a1 + fadd.s s1, a1, a0 + xvreplve0.w VM0, VM0 + li.d TEMP, 1 + li.w I, -1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + xvreplgr2vr.w neg1, I + xvffint.s.w neg1, neg1 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L23 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvld VX1, X, 8 * SIZE + addi.d I, I, -1 + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s x3, neg1, x1 + xvfmul.s x4, neg1, x2 + xvfcmp.clt.s VT0, x1, res0 + xvfcmp.clt.s VT1, x2, res0 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VT1 + addi.d X, X, 16 * SIZE + xvfadd.s VM1, x1, x2 + xvfmin.s VM0, VM0, VM1 + blt $r0, I, .L10 + .align 3 + +.L11: + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfmin.s VM1, x1, x2 + xvfmin.s VM0, x3, x4 + xvfmin.s VM0, VM0, VM1 + b .L23 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L23 + .align 3 + +.L21: + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + addi.d I, I, -1 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s3, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s4, t1, t3 + blt $r0, I, .L21 + .align 3 + +.L22: + fmin.s s1, s1, s2 + fmin.s s3, s3, s4 + fmin.s s1, s1, s3 + .align 3 + +.L23: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + addi.d I, I, -1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 + add.d X, X, INCX + fmin.s s1, a0, s1 + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/camin_lsx.S b/kernel/loongarch64/camin_lsx.S new file mode 100644 index 000000000..e9ad6b04d --- /dev/null +++ b/kernel/loongarch64/camin_lsx.S @@ -0,0 +1,211 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. 
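
[Editorial note] The .L10 loop of camin_lasx.S above forms the absolute values without a dedicated abs instruction: each lane is multiplied by -1 (the neg1 register) and a compare-and-select (xvfcmp.clt against zero, then xvbitsel) keeps whichever of the original and negated lane is non-negative. A scalar C equivalent of that select, illustrative only:

/* Branchless-style abs as used in the .L10 loop; scalar form for clarity. */
static inline float abs_by_select(float v)
{
    float neg = -1.0f * v;        /* xvfmul.s  x3, neg1, x1       */
    int is_neg = (v < 0.0f);      /* xvfcmp.clt.s VT0, x1, res0   */
    return is_neg ? neg : v;      /* xvbitsel.v x1, x1, x3, VT0   */
}

The two absolute lanes are then added (xvfadd) to get |re|+|im| and fed into the running xvfmin accumulator VM0.
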
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $f14 +#define t2 $f18 +#define t3 $f15 +#define t4 $f17 +#define s1 $f22 +#define s2 $f9 +#define s3 $f10 +#define s4 $f11 +#define TEMP $r16 +#define a0 $f20 +#define a1 $f21 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VT0 $vr13 +#define VT1 $vr14 +#define res0 $vr18 +#define neg1 $vr19 +#define VX0 $vr20 +#define VX1 $vr21 +#define VM0 $vr22 +#define VM1 $vr23 + + PROLOGUE + MTC s1, $r0 + vxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + fld.s a0, X, 0 * SIZE + fld.s a1, X, 1 * SIZE + fabs.s a0, a0 + fabs.s a1, a1 + fadd.s s1, a1, a0 + vreplvei.w VM0, VM0, 0 + li.d TEMP, 1 + li.w I, -1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + vreplgr2vr.w neg1, I + vffint.s.w neg1, neg1 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L23 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + addi.d I, I, -1 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, neg1, x1 + vfmul.s x4, neg1, x2 + vfcmp.clt.s VT0, x1, res0 + vfcmp.clt.s VT1, x2, res0 + vld VX0, X, 8 * SIZE + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT1 + vld VX1, X, 12 * SIZE + vfadd.s VM1, x1, x2 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, neg1, x1 + vfmul.s x4, neg1, x2 + vfcmp.clt.s VT0, x1, res0 + vfcmp.clt.s VT1, x2, res0 + addi.d X, X, 16 * SIZE + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT1 + vfadd.s x1, x1, x2 + vfmin.s VM1, x1, VM1 + vfmin.s VM0, VM0, VM1 + blt $r0, I, .L10 + .align 3 + +.L11: + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmin.s VM1, x1, x2 + vfmin.s VM0, x3, x4 + vfmin.s VM0, VM0, VM1 + b .L23 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L23 + .align 3 + +.L21: + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s 
t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + addi.d I, I, -1 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s3, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s4, t1, t3 + blt $r0, I, .L21 + .align 3 + +.L22: + fmin.s s1, s1, s2 + fmin.s s3, s3, s4 + fmin.s s1, s1, s3 + .align 3 + +.L23: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + fld.s a0, X, 0 * SIZE + fld.s a1, X, 1 * SIZE + addi.d I, I, -1 + fabs.s a0, a0 + fabs.s a1, a1 + fadd.s a0, a0, a1 + add.d X, X, INCX + fmin.s s1, a0, s1 + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE From 2a34fb4b80d494d76da7a9ea6a3d54dffbd57f37 Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Wed, 27 Dec 2023 18:17:51 +0800 Subject: [PATCH 13/21] loongarch64: Add and refine scal optimization functions. Signed-off-by: Hao Chen --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 6 +- kernel/loongarch64/KERNEL.LOONGSON3R5 | 6 +- kernel/loongarch64/cscal_lasx.S | 645 +++++++++++++++++++++++ kernel/loongarch64/cscal_lsx.S | 571 ++++++++++++++++++++ kernel/loongarch64/scal_lasx.S | 282 ++++++++++ kernel/loongarch64/scal_lsx.S | 301 +++++++++++ kernel/loongarch64/sscal_lasx.S | 188 ------- kernel/loongarch64/sscal_lsx.S | 194 ------- 8 files changed, 1807 insertions(+), 386 deletions(-) create mode 100644 kernel/loongarch64/cscal_lasx.S create mode 100644 kernel/loongarch64/cscal_lsx.S create mode 100644 kernel/loongarch64/scal_lasx.S create mode 100644 kernel/loongarch64/scal_lsx.S delete mode 100644 kernel/loongarch64/sscal_lasx.S delete mode 100644 kernel/loongarch64/sscal_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index fc1766ff5..7abdae55a 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -4,8 +4,10 @@ SDOTKERNEL = dot_lsx.S DSDOTKERNEL = dot_lsx.S DDOTKERNEL = dot_lsx.S -SSCALKERNEL = sscal_lsx.S -DSCALKERNEL = dscal_lsx.S +SSCALKERNEL = scal_lsx.S +DSCALKERNEL = scal_lsx.S +CSCALKERNEL = cscal_lsx.S +ZSCALKERNEL = cscal_lsx.S SAMAXKERNEL = amax_lsx.S DAMAXKERNEL = amax_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 7de9d4440..13f9f23ed 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -4,8 +4,10 @@ SDOTKERNEL = dot_lasx.S DSDOTKERNEL = dot_lasx.S DDOTKERNEL = dot_lasx.S -SSCALKERNEL = sscal_lasx.S -DSCALKERNEL = dscal_lasx.S +SSCALKERNEL = scal_lasx.S +DSCALKERNEL = scal_lasx.S +CSCALKERNEL = cscal_lasx.S +ZSCALKERNEL = cscal_lasx.S SAMAXKERNEL = amax_lasx.S DAMAXKERNEL = amax_lasx.S diff --git a/kernel/loongarch64/cscal_lasx.S b/kernel/loongarch64/cscal_lasx.S new file mode 100644 index 000000000..3605a6c0e --- /dev/null +++ 
b/kernel/loongarch64/cscal_lasx.S @@ -0,0 +1,645 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define ALPHAR $f0 +#define ALPHAI $f1 +#define X $r7 +#define INCX $r8 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VXAR $xr23 +#define VXAI $xr19 +#define VXZ $xr12 +#define x1 $xr18 +#define x2 $xr17 +#define x3 $xr16 +#define x4 $xr15 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + MTG t1, ALPHAR +#ifdef DOUBLE + xvreplgr2vr.d VXAR, t1 + movfr2gr.d t2, ALPHAI + xvreplgr2vr.d VXAI, t2 + xvxor.v VXZ, VXZ, VXZ + srai.d I, N, 2 +#else + xvreplgr2vr.w VXAR, t1 + movfr2gr.s t2, ALPHAI + xvreplgr2vr.w VXAI, t2 + xvxor.v VXZ, VXZ, VXZ + srai.d I, N, 3 +#endif + bne INCX, TEMP, .L22 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, ALPHAR, a1 + CMPEQ $fcc1, ALPHAI, a1 + bceqz $fcc0, .L13 + b .L14 + .align 3 + +.L13: + bceqz $fcc1, .L114 //alpha_r != 0.0 && alpha_i != 0.0 + b .L113 //alpha_r != 0.0 && alpha_i == 0.0 + +.L14: + bceqz $fcc1, .L112 //alpha_r == 0.0 && alpha_i != 0.0 + b .L111 //alpha_r == 0.0 && alpha_i == 0.0 + .align 3 + +.L111: //alpha_r == 0.0 && alpha_i == 0.0 + xvst VXZ, X, 0 * SIZE +#ifdef DOUBLE + xvst VXZ, X, 4 * SIZE + addi.d X, X, 8 * SIZE +#else + xvst VXZ, X, 8 * SIZE + addi.d X, X, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: //alpha_r == 0.0 && alpha_i != 0.0 + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, 
VX0 + xvfmul.d x3, VXAI, x2 + xvfsub.d x3, VXZ, x3 + xvfmul.d x4, VXAI, x1 + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + xvst VX2, X, 0 * SIZE + xvst VX3, X, 4 * SIZE + addi.d X, X, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s x3, VXAI, x2 + xvfsub.s x3, VXZ, x3 + xvfmul.s x4, VXAI, x1 + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + xvst VX2, X, 0 * SIZE + xvst VX3, X, 8 * SIZE + addi.d X, X, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: //alpha_r != 0.0 && alpha_i == 0.0 + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmul.d x3, VXAR, x1 + xvfmul.d x4, VXAR, x2 + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + xvst VX2, X, 0 * SIZE + xvst VX3, X, 4 * SIZE + addi.d X, X, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s x3, VXAR, x1 + xvfmul.s x4, VXAR, x2 + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + xvst VX2, X, 0 * SIZE + xvst VX3, X, 8 * SIZE + addi.d X, X, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: //alpha_r != 0.0 && alpha_i != 0.0 + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmul.d VX0, VXAI, x2 + xvfmsub.d x3, VXAR, x1, VX0 + xvfmul.d VX1, VXAI, x1 + xvfmadd.d x4, VXAR, x2, VX1 + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + xvst VX2, X, 0 * SIZE + xvst VX3, X, 4 * SIZE + addi.d X, X, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s VX0, VXAI, x2 + xvfmsub.s x3, VXAR, x1, VX0 + xvfmul.s VX1, VXAI, x1 + xvfmadd.s x4, VXAR, x2, VX1 + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + xvst VX2, X, 0 * SIZE + xvst VX3, X, 8 * SIZE + addi.d X, X, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move XX, X + CMPEQ $fcc0, ALPHAR, a1 + CMPEQ $fcc1, ALPHAI, a1 + bceqz $fcc0, .L23 + b .L24 + .align 3 + +.L23: + bceqz $fcc1, .L224 //alpha_r != 0.0 && alpha_i != 0.0 + b .L223 //alpha_r != 0.0 && alpha_i == 0.0 + +.L24: + bceqz $fcc1, .L222 //alpha_r == 0.0 && alpha_i != 0.0 + b .L221 //alpha_r == 0.0 && alpha_i == 0.0 + .align 3 + +.L221: //alpha_r == 0.0 && alpha_i == 0.0 +#ifdef DOUBLE + xvstelm.d VXZ, X, 0, 0 + xvstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.d VXZ, X, 0, 0 + xvstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.d VXZ, X, 0, 0 + xvstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.d VXZ, X, 0, 0 + xvstelm.d VXZ, X, 1 * SIZE, 0 +#else + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 +#endif + add.d X, X, INCX + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: //alpha_r == 0.0 && alpha_i != 0.0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + 
xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + + xvfmul.d x3, VXAI, x2 + xvfsub.d x3, VXZ, x3 + xvfmul.d x4, VXAI, x1 + addi.d I, I, -1 + xvstelm.d x3, XX, 0 * SIZE, 0 + xvstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 1 + xvstelm.d x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 2 + xvstelm.d x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 3 + xvstelm.d x4, XX, 1 * SIZE, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + + xvfmul.s x3, VXAI, x2 + xvfsub.s x3, VXZ, x3 + xvfmul.s x4, VXAI, x1 + addi.d I, I, -1 + xvstelm.w x3, XX, 0 * SIZE, 0 + xvstelm.w x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 1 + xvstelm.w x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 2 + xvstelm.w x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 3 + xvstelm.w x4, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 4 + xvstelm.w x4, XX, 1 * SIZE, 4 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 5 + xvstelm.w x4, XX, 1 * SIZE, 5 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 6 + xvstelm.w x4, XX, 1 * SIZE, 6 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 7 + xvstelm.w x4, XX, 1 * SIZE, 7 +#endif + add.d XX, XX, INCX + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: //alpha_r != 0.0 && alpha_i == 0.0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + + xvfmul.d x3, VXAR, x1 + xvfmul.d x4, VXAR, x2 + addi.d I, I, -1 + xvstelm.d x3, XX, 0 * SIZE, 0 + xvstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 1 + xvstelm.d x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 2 + xvstelm.d x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 3 + xvstelm.d x4, XX, 1 * SIZE, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 
+ xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + + xvfmul.s x3, VXAR, x1 + xvfmul.s x4, VXAR, x2 + addi.d I, I, -1 + xvstelm.w x3, XX, 0 * SIZE, 0 + xvstelm.w x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 1 + xvstelm.w x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 2 + xvstelm.w x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 3 + xvstelm.w x4, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 4 + xvstelm.w x4, XX, 1 * SIZE, 4 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 5 + xvstelm.w x4, XX, 1 * SIZE, 5 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 6 + xvstelm.w x4, XX, 1 * SIZE, 6 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 7 + xvstelm.w x4, XX, 1 * SIZE, 7 +#endif + add.d XX, XX, INCX + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: //alpha_r != 0.0 && alpha_i != 0.0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + + xvfmul.d VX0, VXAI, x2 + xvfmsub.d x3, VXAR, x1, VX0 + xvfmul.d VX1, VXAI, x1 + xvfmadd.d x4, VXAR, x2, VX1 + addi.d I, I, -1 + xvstelm.d x3, XX, 0 * SIZE, 0 + xvstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 1 + xvstelm.d x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 2 + xvstelm.d x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 3 + xvstelm.d x4, XX, 1 * SIZE, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + + xvfmul.s VX0, VXAI, x2 + xvfmsub.s x3, VXAR, x1, VX0 + xvfmul.s VX1, VXAI, x1 + xvfmadd.s x4, VXAR, x2, VX1 + addi.d 
I, I, -1 + xvstelm.w x3, XX, 0 * SIZE, 0 + xvstelm.w x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 1 + xvstelm.w x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 2 + xvstelm.w x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 3 + xvstelm.w x4, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 4 + xvstelm.w x4, XX, 1 * SIZE, 4 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 5 + xvstelm.w x4, XX, 1 * SIZE, 5 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 6 + xvstelm.w x4, XX, 1 * SIZE, 6 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 7 + xvstelm.w x4, XX, 1 * SIZE, 7 +#endif + add.d XX, XX, INCX + blt $r0, I, .L224 + b .L997 + .align 3 + +.L997: +#ifdef DOUBLE + andi I, N, 3 +#else + andi I, N, 7 +#endif + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MSUB s1, ALPHAR, a1, s1 + MADD s2, ALPHAR, a2, s2 + ST s1, X, 0 * SIZE + ST s2, X, 1 * SIZE + add.d X, X, INCX + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/cscal_lsx.S b/kernel/loongarch64/cscal_lsx.S new file mode 100644 index 000000000..f442a754f --- /dev/null +++ b/kernel/loongarch64/cscal_lsx.S @@ -0,0 +1,571 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
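
[Editorial note] cscal_lasx.S above (and cscal_lsx.S, which follows) tests alpha_r and alpha_i once, before the main loop, and branches to one of four dedicated loops (.L111-.L114 for the contiguous path, .L221-.L224 for the strided path) so the zero cases skip the extra multiplies. A scalar sketch of the four cases is given below; the per-element test is only for compactness, and cscal_ref is an assumed name, not a symbol from the patch.

/* Scalar sketch of the four alpha cases handled by the cscal kernels. */
#include <stddef.h>

static void cscal_ref(size_t n, float alpha_r, float alpha_i,
                      float *x, ptrdiff_t incx)
{
    for (size_t i = 0; i < n; i++, x += 2 * incx) {
        float xr = x[0], xi = x[1];
        if (alpha_r == 0.0f && alpha_i == 0.0f) {   /* .L111/.L221: store zeros        */
            x[0] = 0.0f; x[1] = 0.0f;
        } else if (alpha_r == 0.0f) {               /* .L112/.L222: pure imaginary alpha */
            x[0] = -alpha_i * xi;
            x[1] =  alpha_i * xr;
        } else if (alpha_i == 0.0f) {               /* .L113/.L223: pure real alpha      */
            x[0] = alpha_r * xr;
            x[1] = alpha_r * xi;
        } else {                                    /* .L114/.L224: full complex scale   */
            x[0] = alpha_r * xr - alpha_i * xi;
            x[1] = alpha_r * xi + alpha_i * xr;
        }
    }
}

In the assembly the case selection costs two fcmp/bceqz pairs in total, not per element, which is the point of duplicating the loop bodies.
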
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define ALPHAR $f0 +#define ALPHAI $f1 +#define X $r7 +#define INCX $r8 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VXAR $vr23 +#define VXAI $vr19 +#define VXZ $vr12 +#define x1 $vr18 +#define x2 $vr17 +#define x3 $vr16 +#define x4 $vr15 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + MTG t1, ALPHAR +#ifdef DOUBLE + vreplgr2vr.d VXAR, t1 + movfr2gr.d t2, ALPHAI + vreplgr2vr.d VXAI, t2 +#else + vreplgr2vr.w VXAR, t1 + movfr2gr.s t2, ALPHAI + vreplgr2vr.w VXAI, t2 +#endif + vxor.v VXZ, VXZ, VXZ + srai.d I, N, 2 + bne INCX, TEMP, .L22 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, ALPHAR, a1 + CMPEQ $fcc1, ALPHAI, a1 + bceqz $fcc0, .L13 + b .L14 + .align 3 + +.L13: + bceqz $fcc1, .L114 //alpha_r != 0.0 && alpha_i != 0.0 + b .L113 //alpha_r != 0.0 && alpha_i == 0.0 + +.L14: + bceqz $fcc1, .L112 //alpha_r == 0.0 && alpha_i != 0.0 + b .L111 //alpha_r == 0.0 && alpha_i == 0.0 + .align 3 + +.L111: //alpha_r == 0.0 && alpha_i == 0.0 + vst VXZ, X, 0 * SIZE +#ifdef DOUBLE + vst VXZ, X, 2 * SIZE + vst VXZ, X, 4 * SIZE + vst VXZ, X, 6 * SIZE +#else + vst VXZ, X, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: //alpha_r == 0.0 && alpha_i != 0.0 + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXAI, x2 + vfsub.d x3, VXZ, x3 + vfmul.d x4, VXAI, x1 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, X, 0 * SIZE + vst VX3, X, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXAI, x2 + vfsub.d x3, VXZ, x3 + vfmul.d x4, VXAI, x1 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, X, 4 * SIZE + vst VX3, X, 6 * SIZE +#else + vld VX1, X, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, VXAI, x2 + vfsub.s x3, VXZ, x3 + vfmul.s x4, VXAI, x1 + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + vst VX2, X, 0 * SIZE + vst VX3, X, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: //alpha_r != 0.0 && alpha_i == 0.0 + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXAR, x1 + vfmul.d x4, VXAR, x2 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, X, 0 * SIZE + vst VX3, X, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXAR, x1 + vfmul.d x4, VXAR, x2 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, X, 4 * SIZE + vst VX3, X, 6 * SIZE +#else + vld VX1, X, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, VXAR, x1 + vfmul.s x4, VXAR, x2 + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + vst VX2, X, 0 * SIZE + vst VX3, X, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: //alpha_r != 0.0 && alpha_i != 0.0 + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 
2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d VX0, VXAI, x2 + vfmsub.d x3, VXAR, x1, VX0 + vfmul.d VX1, VXAI, x1 + vfmadd.d x4, VXAR, x2, VX1 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, X, 0 * SIZE + vst VX3, X, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d VX0, VXAI, x2 + vfmsub.d x3, VXAR, x1, VX0 + vfmul.d VX1, VXAI, x1 + vfmadd.d x4, VXAR, x2, VX1 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, X, 4 * SIZE + vst VX3, X, 6 * SIZE +#else + vld VX1, X, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s VX0, VXAI, x2 + vfmsub.s x3, VXAR, x1, VX0 + vfmul.s VX1, VXAI, x1 + vfmadd.s x4, VXAR, x2, VX1 + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + vst VX2, X, 0 * SIZE + vst VX3, X, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move XX, X + CMPEQ $fcc0, ALPHAR, a1 + CMPEQ $fcc1, ALPHAI, a1 + bceqz $fcc0, .L23 + b .L24 + .align 3 + +.L23: + bceqz $fcc1, .L224 //alpha_r != 0.0 && alpha_i != 0.0 + b .L223 //alpha_r != 0.0 && alpha_i == 0.0 + +.L24: + bceqz $fcc1, .L222 //alpha_r == 0.0 && alpha_i != 0.0 + b .L221 //alpha_r == 0.0 && alpha_i == 0.0 + .align 3 + +.L221: //alpha_r == 0.0 && alpha_i == 0.0 +#ifdef DOUBLE + vstelm.d VXZ, X, 0, 0 + vstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.d VXZ, X, 0, 0 + vstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.d VXZ, X, 0, 0 + vstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.d VXZ, X, 0, 0 + vstelm.d VXZ, X, 1 * SIZE, 0 +#else + vstelm.w VXZ, X, 0, 0 + vstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.w VXZ, X, 0, 0 + vstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.w VXZ, X, 0, 0 + vstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.w VXZ, X, 0, 0 + vstelm.w VXZ, X, 1 * SIZE, 0 +#endif + add.d X, X, INCX + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: //alpha_r == 0.0 && alpha_i != 0.0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vfmul.d x3, VXAI, x2 + vfsub.d x3, VXZ, x3 + vfmul.d x4, VXAI, x1 + vstelm.d x3, XX, 0 * SIZE, 0 + vstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d x3, XX, 0 * SIZE, 1 + vstelm.d x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vfmul.d x3, VXAI, x2 + vfsub.d x3, VXZ, x3 + vfmul.d x4, VXAI, x1 + addi.d I, I, -1 + vstelm.d x3, XX, 0 * SIZE, 0 + vstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d x3, XX, 0 * SIZE, 1 + vstelm.d x4, XX, 1 * SIZE, 1 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + + vfmul.s x3, VXAI, x2 + vfsub.s x3, VXZ, x3 + vfmul.s x4, VXAI, x1 + addi.d I, I, -1 + vstelm.w x3, XX, 0 * SIZE, 0 + vstelm.w 
x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 1 + vstelm.w x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 2 + vstelm.w x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 3 + vstelm.w x4, XX, 1 * SIZE, 3 +#endif + add.d XX, XX, INCX + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: //alpha_r != 0.0 && alpha_i == 0.0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vfmul.d x3, VXAR, x1 + vfmul.d x4, VXAR, x2 + vstelm.d x3, XX, 0 * SIZE, 0 + vstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d x3, XX, 0 * SIZE, 1 + vstelm.d x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vfmul.d x3, VXAR, x1 + vfmul.d x4, VXAR, x2 + addi.d I, I, -1 + vstelm.d x3, XX, 0 * SIZE, 0 + vstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d x3, XX, 0 * SIZE, 1 + vstelm.d x4, XX, 1 * SIZE, 1 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + + vfmul.s x3, VXAR, x1 + vfmul.s x4, VXAR, x2 + addi.d I, I, -1 + vstelm.w x3, XX, 0 * SIZE, 0 + vstelm.w x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 1 + vstelm.w x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 2 + vstelm.w x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 3 + vstelm.w x4, XX, 1 * SIZE, 3 +#endif + add.d XX, XX, INCX + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: //alpha_r != 0.0 && alpha_i != 0.0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vfmul.d VX0, VXAI, x2 + vfmsub.d x3, VXAR, x1, VX0 + vfmul.d VX1, VXAI, x1 + vfmadd.d x4, VXAR, x2, VX1 + vstelm.d x3, XX, 0 * SIZE, 0 + vstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d x3, XX, 0 * SIZE, 1 + vstelm.d x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vfmul.d VX0, VXAI, x2 + vfmsub.d x3, VXAR, x1, VX0 + vfmul.d VX1, VXAI, x1 + vfmadd.d x4, VXAR, x2, VX1 + addi.d I, I, -1 + vstelm.d x3, XX, 0 * SIZE, 0 + vstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d x3, XX, 0 * SIZE, 1 + vstelm.d x4, XX, 1 * SIZE, 1 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * 
SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + + vfmul.s VX0, VXAI, x2 + vfmsub.s x3, VXAR, x1, VX0 + vfmul.s VX1, VXAI, x1 + vfmadd.s x4, VXAR, x2, VX1 + addi.d I, I, -1 + vstelm.w x3, XX, 0 * SIZE, 0 + vstelm.w x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 1 + vstelm.w x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 2 + vstelm.w x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 3 + vstelm.w x4, XX, 1 * SIZE, 3 +#endif + add.d XX, XX, INCX + blt $r0, I, .L224 + b .L997 + .align 3 + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MSUB s1, ALPHAR, a1, s1 + MADD s2, ALPHAR, a2, s2 + ST s1, X, 0 * SIZE + ST s2, X, 1 * SIZE + add.d X, X, INCX + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/scal_lasx.S b/kernel/loongarch64/scal_lasx.S new file mode 100644 index 000000000..48e2c0718 --- /dev/null +++ b/kernel/loongarch64/scal_lasx.S @@ -0,0 +1,282 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define ALPHA $f0 +#define X $r7 +#define INCX $r8 +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r18 +#define t3 $r15 +#define t4 $r17 +#define XX $r16 +#define VX0 $xr12 +#define VX1 $xr13 +#define VT0 $xr14 +#define VT1 $xr15 +#define VALPHA $xr19 +#define a1 $f8 +#define a2 $f23 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + movgr2fr.d a2, TEMP + FFINT a2, a2 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + CMPEQ $fcc0, ALPHA, a1 + bcnez $fcc0, .L20 //ALPHA==0 + CMPEQ $fcc0, ALPHA, a2 + bcnez $fcc0, .L999 //ALPHA==1 return + srai.d I, N, 3 + beq INCX, TEMP, .L30 //ALPHA!=0|1 and INCX==1 + MTG TEMP, ALPHA +#ifdef DOUBLE + xvreplgr2vr.d VALPHA, TEMP +#else + xvreplgr2vr.w VALPHA, TEMP +#endif + move XX, X + .align 3 + +.L10: //ALPHA!=0|1 and INCX!=1 + bge $r0, I, .L32 + .align 3 +.L11: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + xvfmul.d VT0, VX0, VALPHA + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvfmul.d VT1, VX1, VALPHA + xvstelm.d VT1, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT1, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT1, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT1, XX, 0, 3 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvfmul.s VT0, VX0, VALPHA + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 +#endif + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L11 + b .L32 + .align 3 + +.L20: + srai.d I, N, 3 + beq INCX, TEMP, .L24 + bge $r0, I, .L22 + .align 3 + +.L21: + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L23: + ST a1, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + 
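+    // ALPHA == 0 with non-unit INCX: keep storing 0.0 until the N % 8 tail (counted in I) is exhausted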
blt $r0, I, .L23 + jirl $r0, $r1, 0 + .align 3 + +.L24: + bge $r0, I, .L26 /*N<8 INCX==1*/ + .align 3 +.L25: + xvxor.v VX0, VX0, VX0 + xvst VX0, X, 0 * SIZE +#ifdef DOUBLE + xvst VX0, X, 4 * SIZE +#endif + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L25 + .align 3 + +.L26: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L27: + ST a1, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L27 + jirl $r0, $r1, 0 + .align 3 + +.L30: + bge $r0, I, .L32/*N<8 INCX==1*/ + MTG TEMP, ALPHA +#ifdef DOUBLE + xvreplgr2vr.d VALPHA , TEMP +#else + xvreplgr2vr.w VALPHA , TEMP +#endif + .align 3 + +.L31: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvfmul.d VT0, VX0, VALPHA + xvfmul.d VT1, VX1, VALPHA + xvst VT0, X, 0 * SIZE + xvst VT1, X, 4 * SIZE +#else + xvfmul.s VT0, VX0, VALPHA + xvst VT0, X, 0 * SIZE +#endif + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L31 + .align 3 + +.L32: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L33: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + MUL a1, ALPHA, a1 + ST a1, X, 0 * SIZE + add.d X, X, INCX + blt $r0, I, .L33 + jirl $r0, $r1, 0 + .align 3 + +.L999: + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/scal_lsx.S b/kernel/loongarch64/scal_lsx.S new file mode 100644 index 000000000..1ffce7db2 --- /dev/null +++ b/kernel/loongarch64/scal_lsx.S @@ -0,0 +1,301 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define ALPHA $f0 +#define X $r7 +#define INCX $r8 +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r18 +#define t3 $r15 +#define t4 $r17 +#define XX $r16 +#define VX0 $vr12 +#define VX1 $vr13 +#define VT0 $vr14 +#define VT1 $vr15 +#define VALPHA $vr19 +#define a1 $f8 +#define a2 $f23 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + movgr2fr.d a2, TEMP + FFINT a2, a2 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + CMPEQ $fcc0, ALPHA, a1 + bcnez $fcc0, .L20 //ALPHA==0 + CMPEQ $fcc0, ALPHA, a2 + bcnez $fcc0, .L999 //ALPHA==1 return + srai.d I, N, 3 + beq INCX, TEMP, .L30 //ALPHA!=0|1 and INCX==1 + MTG TEMP, ALPHA +#ifdef DOUBLE + vreplgr2vr.d VALPHA, TEMP +#else + vreplgr2vr.w VALPHA, TEMP +#endif + move XX, X + .align 3 + +.L10: //ALPHA!=0|1 and INCX!=1 + bge $r0, I, .L32 + .align 3 + +.L11: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vfmul.d VT0, VX0, VALPHA + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vfmul.d VT1, VX1, VALPHA + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vstelm.d VT1, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT1, XX, 0, 1 + add.d XX, XX, INCX + vfmul.d VT0, VX0, VALPHA + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vfmul.d VT1, VX1, VALPHA + vstelm.d VT1, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT1, XX, 0, 1 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + vfmul.s VT0, VX0, VALPHA + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vfmul.s VT1, VX1, VALPHA + vstelm.w VT1, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT1, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT1, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT1, XX, 0, 3 +#endif + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L11 + b .L32 + .align 3 + +.L20: + srai.d I, N, 3 + beq INCX, TEMP, .L24 + bge $r0, I, .L22 + .align 3 + +.L21: + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L23: + ST a1, X, 0 * 
SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L23 + jirl $r0, $r1, 0 + .align 3 + +.L24: + bge $r0, I, .L26 /*N<8 INCX==1*/ + .align 3 + +.L25: + vxor.v VX0, VX0, VX0 + vst VX0, X, 0 * SIZE +#ifdef DOUBLE + vst VX0, X, 2 * SIZE + vst VX0, X, 4 * SIZE + vst VX0, X, 6 * SIZE +#else + vst VX0, X, 4 * SIZE +#endif + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L25 + .align 3 + +.L26: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L27: + ST a1, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L27 + jirl $r0, $r1, 0 + .align 3 + +.L30: + bge $r0, I, .L32/*N<8 INCX==1*/ + MTG TEMP, ALPHA +#ifdef DOUBLE + vreplgr2vr.d VALPHA , TEMP +#else + vreplgr2vr.w VALPHA , TEMP +#endif + .align 3 + +.L31: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vfmul.d VT0, VX0, VALPHA + vfmul.d VT1, VX1, VALPHA + vld VX0, X, 4 * SIZE + vst VT0, X, 0 * SIZE + vst VT1, X, 2 * SIZE + vfmul.d VT0, VX0, VALPHA + vld VX1, X, 6 * SIZE + vst VT0, X, 4 * SIZE + vfmul.d VT1, VX1, VALPHA + vst VT1, X, 6 * SIZE + addi.d I, I, -1 +#else + vld VX1, X, 4 * SIZE + vfmul.s VT0, VX0, VALPHA + vfmul.s VT1, VX1, VALPHA + addi.d I, I, -1 + vst VT0, X, 0 * SIZE + vst VT1, X, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + blt $r0, I, .L31 + .align 3 + +.L32: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L33: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + MUL a1, ALPHA, a1 + ST a1, X, 0 * SIZE + add.d X, X, INCX + blt $r0, I, .L33 + jirl $r0, $r1, 0 + .align 3 + +.L999: + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/sscal_lasx.S b/kernel/loongarch64/sscal_lasx.S deleted file mode 100644 index 329f24659..000000000 --- a/kernel/loongarch64/sscal_lasx.S +++ /dev/null @@ -1,188 +0,0 @@ -#define ASSEMBLER -#include "common.h" - -#define N $r4 -#define ALPHA $f0 -#define X $r7 -#define INCX $r8 -#define I $r12 -#define TEMP $r13 -#define t1 $r14 -#define t2 $r18 -#define t3 $r15 -#define t4 $r17 -#define XX $r16 -#define VX0 $xr12 -#define VX1 $xr13 -#define VT0 $xr14 -#define VT1 $xr15 -#define VALPHA $xr19 -#define a1 $f8 -#define a2 $f23 - - PROLOGUE - - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - movgr2fr.d a1, $r0 - ffint.s.l a1, a1 - movgr2fr.d a2, TEMP - ffint.s.l a2, a2 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - fcmp.ceq.s $fcc0, ALPHA, a1 - bcnez $fcc0, .L20 //ALPHA==0 - fcmp.ceq.s $fcc0, ALPHA, a2 - bcnez $fcc0, .L999 //ALPHA==1 return - srai.d I, N, 3 - beq INCX, TEMP, .L30 //ALPHA!=0|1 and INCX==1 - movfr2gr.s TEMP, ALPHA - xvreplgr2vr.w VALPHA, TEMP - move XX, X - -.L10: //ALPHA!=0|1 and INCX!=1 - bge $r0, I, .L32 - .align 3 -.L11: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - xvfmul.s VT0, VX0, VALPHA - xvstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 4 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 5 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 6 - add.d XX, XX, INCX - xvstelm.w 
VT0, XX, 0, 7 - add.d XX, XX, INCX - addi.d I, I, -1 - blt $r0, I, .L11 - b .L32 - .align 3 - -.L20: - srai.d I, N, 3 - beq INCX, TEMP, .L24 - bge $r0, I, .L22 - .align 3 - -.L21: - fst.s a1, X, 0 - add.d X, X, INCX - fst.s a1, X, 0 - add.d X, X, INCX - fst.s a1, X, 0 - add.d X, X, INCX - fst.s a1, X, 0 - add.d X, X, INCX - fst.s a1, X, 0 - add.d X, X, INCX - fst.s a1, X, 0 - add.d X, X, INCX - fst.s a1, X, 0 - add.d X, X, INCX - fst.s a1, X, 0 - add.d X, X, INCX - addi.d I, I, -1 - blt $r0, I, .L21 - .align 3 - -.L22: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 -.L23: - fst.s a1, X, 0 * SIZE - addi.d I, I, -1 - add.d X, X, INCX - blt $r0, I, .L23 - jirl $r0, $r1, 0 - .align 3 - -.L24: - bge $r0, I, .L26 /*N<8 INCX==1*/ - .align 3 -.L25: - xvxor.v VX0, VX0, VX0 - xvst VX0, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, 8 * SIZE - blt $r0, I, .L25 - .align 3 - -.L26: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 -.L27: - fst.s a1, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - blt $r0, I, .L27 - jirl $r0, $r1, 0 - .align 3 - -.L30: - bge $r0, I, .L32/*N<8 INCX==1*/ - movfr2gr.s TEMP, ALPHA - xvreplgr2vr.w VALPHA , TEMP - .align 3 - -.L31: - xvld VX0, X, 0 * SIZE - addi.d I, I, -1 - xvfmul.s VT0, VX0, VALPHA - xvst VT0, X, 0 * SIZE - addi.d X, X, 8 * SIZE - blt $r0, I, .L31 - .align 3 - -.L32: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 -.L33: - fld.s a1, X, 0 * SIZE - addi.d I, I, -1 - fmul.s a1, ALPHA, a1 - fst.s a1, X, 0 * SIZE - add.d X, X, INCX - blt $r0, I, .L33 - jirl $r0, $r1, 0 - .align 3 - -.L999: - jirl $r0, $r1, 0x0 - - EPILOGUE diff --git a/kernel/loongarch64/sscal_lsx.S b/kernel/loongarch64/sscal_lsx.S deleted file mode 100644 index d0ea1307d..000000000 --- a/kernel/loongarch64/sscal_lsx.S +++ /dev/null @@ -1,194 +0,0 @@ -#define ASSEMBLER -#include "common.h" - -#define N $r4 -#define ALPHA $f0 -#define X $r7 -#define INCX $r8 -#define I $r12 -#define TEMP $r13 -#define t1 $r14 -#define t2 $r18 -#define t3 $r15 -#define t4 $r17 -#define XX $r16 -#define VX0 $vr12 -#define VX1 $vr13 -#define VT0 $vr14 -#define VT1 $vr15 -#define VALPHA $vr19 -#define a1 $f8 -#define a2 $f23 - - PROLOGUE - - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - movgr2fr.d a1, $r0 - ffint.s.l a1, a1 - movgr2fr.d a2, TEMP - ffint.s.l a2, a2 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - fcmp.ceq.s $fcc0, ALPHA, a1 - bcnez $fcc0, .L20 //ALPHA==0 - fcmp.ceq.s $fcc0, ALPHA, a2 - bcnez $fcc0, .L999 //ALPHA==1 return - srai.d I, N, 3 - beq INCX, TEMP, .L30 //ALPHA!=0|1 and INCX==1 - movfr2gr.s TEMP, ALPHA - vreplgr2vr.w VALPHA, TEMP - move XX, X - .align 3 - -.L10: //ALPHA!=0|1 and INCX!=1 - bge $r0, I, .L32 - .align 3 -.L11: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - vfmul.s VT0, VX0, VALPHA - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - vfmul.s VT1, VX1, VALPHA - vstelm.w VT1, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT1, XX, 0, 1 - add.d XX, XX, 
INCX - vstelm.w VT1, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT1, XX, 0, 3 - add.d XX, XX, INCX - addi.d I, I, -1 - blt $r0, I, .L11 - b .L32 - .align 3 - -.L20: - srai.d I, N, 3 - beq INCX, TEMP, .L24 - bge $r0, I, .L22 - .align 3 - -.L21: - fst.s a1, X, 0 - add.d X, X, INCX - fst.s a1, X, 0 - add.d X, X, INCX - fst.s a1, X, 0 - add.d X, X, INCX - fst.s a1, X, 0 - add.d X, X, INCX - fst.s a1, X, 0 - add.d X, X, INCX - fst.s a1, X, 0 - add.d X, X, INCX - fst.s a1, X, 0 - add.d X, X, INCX - fst.s a1, X, 0 - add.d X, X, INCX - addi.d I, I, -1 - blt $r0, I, .L21 - .align 3 - -.L22: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 -.L23: - fst.s a1, X, 0 * SIZE - addi.d I, I, -1 - add.d X, X, INCX - blt $r0, I, .L23 - jirl $r0, $r1, 0 - .align 3 - -.L24: - bge $r0, I, .L26 /*N<8 INCX==1*/ - .align 3 -.L25: - vxor.v VX0, VX0, VX0 - vst VX0, X, 0 * SIZE - vst VX0, X, 4 * SIZE - addi.d I, I, -1 - addi.d X, X, 8 * SIZE - blt $r0, I, .L25 - .align 3 - -.L26: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 -.L27: - fst.s a1, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - blt $r0, I, .L27 - jirl $r0, $r1, 0 - .align 3 - -.L30: - bge $r0, I, .L32/*N<8 INCX==1*/ - movfr2gr.s TEMP, ALPHA - vreplgr2vr.w VALPHA , TEMP - .align 3 - -.L31: - vld VX0, X, 0 * SIZE - vld VX1, X, 4 * SIZE - vfmul.s VT0, VX0, VALPHA - vfmul.s VT1, VX1, VALPHA - addi.d I, I, -1 - vst VT0, X, 0 * SIZE - vst VT1, X, 4 * SIZE - addi.d X, X, 8 * SIZE - blt $r0, I, .L31 - .align 3 - -.L32: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 -.L33: - fld.s a1, X, 0 * SIZE - addi.d I, I, -1 - fmul.s a1, ALPHA, a1 - fst.s a1, X, 0 * SIZE - add.d X, X, INCX - blt $r0, I, .L33 - jirl $r0, $r1, 0 - .align 3 - -.L999: - jirl $r0, $r1, 0x0 - - EPILOGUE From 65a0aeb128a46b75b189d3a21b619d005351b2d2 Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Thu, 28 Dec 2023 17:45:17 +0800 Subject: [PATCH 14/21] loongarch64: Add c/zcopy optimization functions. Signed-off-by: Hao Chen --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 2 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 2 + kernel/loongarch64/ccopy_lasx.S | 386 +++++++++++++++++++++ kernel/loongarch64/ccopy_lsx.S | 411 +++++++++++++++++++++++ 4 files changed, 801 insertions(+) create mode 100644 kernel/loongarch64/ccopy_lasx.S create mode 100644 kernel/loongarch64/ccopy_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 7abdae55a..2aa68af67 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -39,6 +39,8 @@ IDAMINKERNEL = iamin_lsx.S SCOPYKERNEL = copy_lsx.S DCOPYKERNEL = copy_lsx.S +CCOPYKERNEL = ccopy_lsx.S +ZCOPYKERNEL = ccopy_lsx.S SSWAPKERNEL = swap_lsx.S DSWAPKERNEL = swap_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 13f9f23ed..3bcec2c62 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -39,6 +39,8 @@ IDAMINKERNEL = iamin_lasx.S SCOPYKERNEL = copy_lasx.S DCOPYKERNEL = copy_lasx.S +CCOPYKERNEL = ccopy_lasx.S +ZCOPYKERNEL = ccopy_lasx.S SSWAPKERNEL = swap_lasx.S DSWAPKERNEL = swap_lasx.S diff --git a/kernel/loongarch64/ccopy_lasx.S b/kernel/loongarch64/ccopy_lasx.S new file mode 100644 index 000000000..fbc5d96bc --- /dev/null +++ b/kernel/loongarch64/ccopy_lasx.S @@ -0,0 +1,386 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX2, X, 8 * SIZE + xvld VX3, X, 12 * SIZE + xvst VX0, Y, 0 * SIZE + xvst VX1, Y, 4 * SIZE + xvst VX2, Y, 8 * SIZE + xvst VX3, Y, 12 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvst VX0, Y, 0 * SIZE + xvst VX1, Y, 8 * SIZE +#endif + addi.d I, I, -1 + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX2, X, 8 * SIZE + xvld VX3, X, 12 * SIZE + xvstelm.d VX0, Y, 0 * SIZE, 0 + xvstelm.d VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + xvstelm.d VX0, Y, 0 * SIZE, 2 + xvstelm.d VX0, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0 * SIZE, 0 + xvstelm.d VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0 * SIZE, 2 + xvstelm.d VX1, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + xvstelm.d VX2, Y, 0 * SIZE, 0 + xvstelm.d VX2, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + xvstelm.d VX2, Y, 0 * 
SIZE, 2 + xvstelm.d VX2, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + xvstelm.d VX3, Y, 0 * SIZE, 0 + xvstelm.d VX3, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + xvstelm.d VX3, Y, 0 * SIZE, 2 + xvstelm.d VX3, Y, 1 * SIZE, 3 +#else + xvld VX1, X, 8 * SIZE + xvstelm.w VX0, Y, 0 * SIZE, 0 + xvstelm.w VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0 * SIZE, 2 + xvstelm.w VX0, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0 * SIZE, 4 + xvstelm.w VX0, Y, 1 * SIZE, 5 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0 * SIZE, 6 + xvstelm.w VX0, Y, 1 * SIZE, 7 + add.d Y, Y, INCY + xvstelm.w VX1, Y, 0 * SIZE, 0 + xvstelm.w VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + xvstelm.w VX1, Y, 0 * SIZE, 2 + xvstelm.w VX1, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + xvstelm.w VX1, Y, 0 * SIZE, 4 + xvstelm.w VX1, Y, 1 * SIZE, 5 + add.d Y, Y, INCY + xvstelm.w VX1, Y, 0 * SIZE, 6 + xvstelm.w VX1, Y, 1 * SIZE, 7 +#endif + add.d Y, Y, INCY + addi.d X, X, 16 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + xvst VX0, Y, 0 * SIZE + xvst VX1, Y, 4 * SIZE + xvst VX2, Y, 8 * SIZE + xvst VX3, Y, 12 * SIZE +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 0 + xvinsgr2vr.w VX1, t2, 1 + xvinsgr2vr.w VX1, t3, 2 + xvinsgr2vr.w VX1, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 4 + xvinsgr2vr.w VX1, t2, 5 + xvinsgr2vr.w VX1, t3, 6 + xvinsgr2vr.w VX1, t4, 7 + xvst VX0, Y, 0 * SIZE + xvst VX1, Y, 8 * SIZE +#endif + addi.d I, I, -1 + addi.d Y, Y, 16 * SIZE + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + 
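+    // a1/a2 hold the real and imaginary parts of one complex element of X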
ST a2, Y, 1 * SIZE + add.d X, X, INCX + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + +.L222: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/ccopy_lsx.S b/kernel/loongarch64/ccopy_lsx.S new file mode 100644 index 000000000..4c4d880f1 --- /dev/null +++ b/kernel/loongarch64/ccopy_lsx.S @@ -0,0 +1,411 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11:// INCX==1 and INCY==1 + bge $r0, I, .L112 + .align 3 + +.L111: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX2, X, 4 * SIZE + vld VX3, X, 6 * SIZE + vst VX0, Y, 0 * SIZE + vst VX1, Y, 2 * SIZE + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + vld VX0, X, 8 * SIZE + vld VX1, X, 10 * SIZE + vld VX2, X, 12 * SIZE + vld VX3, X, 14 * SIZE + addi.d I, I, -1 + vst VX0, Y, 8 * SIZE + vst VX1, Y, 10 * SIZE + vst VX2, Y, 12 * SIZE + vst VX3, Y, 14 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX2, X, 8 * SIZE + vld VX3, X, 12 * SIZE + addi.d I, I, -1 + vst VX0, Y, 0 * SIZE + vst VX1, Y, 4 * SIZE + vst VX2, Y, 8 * SIZE + vst VX3, Y, 12 * SIZE +#endif + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + .align 3 + +.L121: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX2, X, 4 * SIZE + vld VX3, X, 6 * SIZE + vstelm.d VX0, Y, 0 * SIZE, 0 + vstelm.d VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0 * SIZE, 0 + vstelm.d VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX2, Y, 0 * SIZE, 0 + vstelm.d VX2, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX3, Y, 0 * SIZE, 0 + vstelm.d VX3, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vld VX0, X, 8 * SIZE + vld VX1, X, 10 * SIZE + vld VX2, X, 12 * SIZE + vld VX3, X, 14 * SIZE + vstelm.d VX0, Y, 0 * SIZE, 0 + vstelm.d VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0 * SIZE, 0 + vstelm.d VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX2, Y, 0 * SIZE, 0 + vstelm.d VX2, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX3, Y, 0 * SIZE, 0 + vstelm.d VX3, Y, 1 * SIZE, 1 +#else + vld VX1, X, 4 * SIZE + vld VX2, X, 8 * SIZE + vld VX3, X, 12 * SIZE + vstelm.w VX0, Y, 0 * SIZE, 0 + vstelm.w VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.w VX0, Y, 0 * SIZE, 2 + vstelm.w VX0, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0 * SIZE, 0 + vstelm.w VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0 * SIZE, 2 + vstelm.w VX1, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + vstelm.w VX2, Y, 0 * SIZE, 0 + vstelm.w VX2, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.w VX2, Y, 0 * SIZE, 2 + vstelm.w VX2, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + vstelm.w VX3, Y, 0 * SIZE, 0 + vstelm.w VX3, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.w VX3, Y, 0 * SIZE, 2 + vstelm.w VX3, Y, 1 * SIZE, 3 +#endif + add.d Y, Y, INCY + addi.d X, X, 16 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + 
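+    // I = N % 8 leftover complex elements; skip the scalar tail when none remain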
bge $r0, I, .L999 + .align 3 + +.L123: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 0 * SIZE + vst VX1, Y, 2 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 4 * SIZE + vst VX1, Y, 6 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 8 * SIZE + vst VX1, Y, 10 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 12 * SIZE + vst VX1, Y, 14 * SIZE +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vst VX0, Y, 0 * SIZE + vst VX1, Y, 4 * SIZE + vst VX2, Y, 8 * SIZE + vst VX3, Y, 12 * SIZE +#endif + addi.d Y, Y, 16 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d X, X, INCX + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + +.L222: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * 
SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE From d97272cb35af5849515b6a5850ddfe642e29430f Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Thu, 28 Dec 2023 19:09:18 +0800 Subject: [PATCH 15/21] loongarch64: Add c/zdot optimization functions. --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 2 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 2 + kernel/loongarch64/cdot_lasx.S | 565 +++++++++++++++++++++++ kernel/loongarch64/cdot_lsx.S | 397 ++++++++++++++++ 4 files changed, 966 insertions(+) create mode 100644 kernel/loongarch64/cdot_lasx.S create mode 100644 kernel/loongarch64/cdot_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 2aa68af67..c70120f9a 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -3,6 +3,8 @@ ifndef NO_LSX SDOTKERNEL = dot_lsx.S DSDOTKERNEL = dot_lsx.S DDOTKERNEL = dot_lsx.S +CDOTKERNEL = cdot_lsx.S +ZDOTKERNEL = cdot_lsx.S SSCALKERNEL = scal_lsx.S DSCALKERNEL = scal_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 3bcec2c62..98673ae09 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -3,6 +3,8 @@ ifndef NO_LASX SDOTKERNEL = dot_lasx.S DSDOTKERNEL = dot_lasx.S DDOTKERNEL = dot_lasx.S +CDOTKERNEL = cdot_lasx.S +ZDOTKERNEL = cdot_lasx.S SSCALKERNEL = scal_lasx.S DSCALKERNEL = scal_lasx.S diff --git a/kernel/loongarch64/cdot_lasx.S b/kernel/loongarch64/cdot_lasx.S new file mode 100644 index 000000000..0583e56ea --- /dev/null +++ b/kernel/loongarch64/cdot_lasx.S @@ -0,0 +1,565 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r19 +#define TEMP $r10 +#define t1 $r11 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define res1 $xr16 +#define res2 $xr17 +#define res3 $xr18 +#define res4 $xr19 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 +#define x1 $xr20 +#define x2 $xr21 +#define x3 $xr22 +#define x4 $xr23 + + PROLOGUE + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + xvxor.v res3, res3, res3 + xvxor.v res4, res4, res4 + bge $r0, N, .L999 + li.d TEMP, 2 * SIZE + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT +#ifdef DOUBLE + srai.d I, N, 2 +#else + srai.d I, N, 3 +#endif + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + .align 3 + +.L111: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 + xvfmadd.d res1, x1, x3, res1 + xvfmadd.d res2, x2, x3, res2 + xvfmadd.d res3, x1, x4, res3 + xvfmadd.d res4, x2, x4, res4 + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 + xvfmadd.s res1, x1, x3, res1 + xvfmadd.s res2, x2, x3, res2 + xvfmadd.s res3, x1, x4, res3 + xvfmadd.s res4, x2, x4, res4 + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L996 + .align 3 + +.L12: + bge $r0, I, .L997 + .align 3 + +.L121: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 2 + xvinsgr2vr.d x4, t4, 2 + xvld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 1 + xvinsgr2vr.d x4, t2, 1 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + addi.d X, X, 8 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmadd.d res1, x1, x3, res1 + xvfmadd.d res2, x2, x3, res2 + xvfmadd.d res3, x1, x4, res3 + xvfmadd.d res4, x2, x4, res4 +#else + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, 
Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + xvld VX1, X, 8 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + addi.d X, X, 16 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmadd.s res1, x1, x3, res1 + xvfmadd.s res2, x2, x3, res2 + xvfmadd.s res3, x1, x4, res3 + xvfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L121 + b .L996 + .align 3 + +.L21: + bge $r0, I, .L997 + .align 3 + +.L211: + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 2 + xvinsgr2vr.d x2, t4, 2 + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 1 + xvinsgr2vr.d x2, t2, 1 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + addi.d Y, Y, 8 * SIZE + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 + xvfmadd.d res1, x1, x3, res1 + xvfmadd.d res2, x2, x3, res2 + xvfmadd.d res3, x1, x4, res3 + xvfmadd.d res4, x2, x4, res4 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + xvld VX3, Y, 8 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + addi.d Y, Y, 8 * SIZE + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 + xvfmadd.s res1, x1, x3, res1 + xvfmadd.s res2, x2, x3, res2 + xvfmadd.s res3, x1, x4, res3 + xvfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L211 + b .L996 + .align 3 + +.L22: + bge $r0, I, .L997 + .align 3 + +.L222: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 1 + xvinsgr2vr.d x4, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + 
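+    // t3/t4 pick up the real and imaginary parts of the next strided element of X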
ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 2 + xvinsgr2vr.d x4, t2, 2 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + xvfmadd.d res1, x1, x3, res1 + xvfmadd.d res2, x2, x3, res2 + xvfmadd.d res3, x1, x4, res3 + xvfmadd.d res4, x2, x4, res4 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + xvfmadd.s res1, x1, x3, res1 + xvfmadd.s res2, x2, x3, res2 + xvfmadd.s res3, x1, x4, res3 + xvfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L996: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 + xvpickve.d VX1, res2, 1 + xvpickve.d VX2, res2, 2 + xvpickve.d VX3, res2, 3 + xvfadd.d res2, VX1, res2 + xvfadd.d res2, VX2, res2 + xvfadd.d res2, VX3, res2 + xvpickve.d VX1, res3, 1 + xvpickve.d VX2, res3, 2 + xvpickve.d VX3, res3, 3 + xvfadd.d res3, VX1, res3 + xvfadd.d res3, VX2, res3 + xvfadd.d res3, VX3, res3 + xvpickve.d VX1, res4, 1 + xvpickve.d VX2, res4, 2 + xvpickve.d VX3, res4, 3 + xvfadd.d res4, VX1, res4 + xvfadd.d res4, VX2, res4 + xvfadd.d res4, VX3, res4 +#else + xvpickve.w VX0, res1, 1 + xvpickve.w VX1, res1, 2 + xvpickve.w VX2, res1, 3 + xvpickve.w VX3, res1, 4 + xvpickve.w x1, res1, 5 + xvpickve.w x2, res1, 6 + xvpickve.w x3, res1, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvfadd.s res1, x1, res1 + xvfadd.s res1, x2, res1 + 
xvfadd.s res1, x3, res1 + xvpickve.w VX0, res2, 1 + xvpickve.w VX1, res2, 2 + xvpickve.w VX2, res2, 3 + xvpickve.w VX3, res2, 4 + xvpickve.w x1, res2, 5 + xvpickve.w x2, res2, 6 + xvpickve.w x3, res2, 7 + xvfadd.s res2, VX0, res2 + xvfadd.s res2, VX1, res2 + xvfadd.s res2, VX2, res2 + xvfadd.s res2, VX3, res2 + xvfadd.s res2, x1, res2 + xvfadd.s res2, x2, res2 + xvfadd.s res2, x3, res2 + xvpickve.w VX0, res3, 1 + xvpickve.w VX1, res3, 2 + xvpickve.w VX2, res3, 3 + xvpickve.w VX3, res3, 4 + xvpickve.w x1, res3, 5 + xvpickve.w x2, res3, 6 + xvpickve.w x3, res3, 7 + xvfadd.s res3, VX0, res3 + xvfadd.s res3, VX1, res3 + xvfadd.s res3, VX2, res3 + xvfadd.s res3, VX3, res3 + xvfadd.s res3, x1, res3 + xvfadd.s res3, x2, res3 + xvfadd.s res3, x3, res3 + xvpickve.w VX0, res4, 1 + xvpickve.w VX1, res4, 2 + xvpickve.w VX2, res4, 3 + xvpickve.w VX3, res4, 4 + xvpickve.w x1, res4, 5 + xvpickve.w x2, res4, 6 + xvpickve.w x3, res4, 7 + xvfadd.s res4, VX0, res4 + xvfadd.s res4, VX1, res4 + xvfadd.s res4, VX2, res4 + xvfadd.s res4, VX3, res4 + xvfadd.s res4, x1, res4 + xvfadd.s res4, x2, res4 + xvfadd.s res4, x3, res4 +#endif + .align 3 + +.L997: +#ifdef DOUBLE + andi I, N, 3 +#else + andi I, N, 7 +#endif + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + MADD s1, a1, a3, s1 + MADD s2, a2, a3, s2 + MADD s3, a1, a4, s3 + MADD s4, a2, a4, s4 + addi.d I, I, -1 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: +#ifndef CONJ + SUB $f0, s1, s4 + ADD $f1, s3, s2 +#else + ADD $f0, s1, s4 + SUB $f1, s3, s2 +#endif + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/cdot_lsx.S b/kernel/loongarch64/cdot_lsx.S new file mode 100644 index 000000000..5feea12be --- /dev/null +++ b/kernel/loongarch64/cdot_lsx.S @@ -0,0 +1,397 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r19 +#define TEMP $r10 +#define t1 $r11 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define res1 $vr16 +#define res2 $vr17 +#define res3 $vr18 +#define res4 $vr19 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 +#define x1 $vr20 +#define x2 $vr21 +#define x3 $vr22 +#define x4 $vr23 + + PROLOGUE + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + vxor.v res3, res3, res3 + vxor.v res4, res4, res4 + bge $r0, N, .L999 + li.d TEMP, 2 * SIZE + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT +#ifdef DOUBLE + srai.d I, N, 1 +#else + srai.d I, N, 2 +#endif + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + .align 3 + +.L111: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmadd.d res1, x1, x3, res1 + vfmadd.d res2, x2, x3, res2 + vfmadd.d res3, x1, x4, res3 + vfmadd.d res4, x2, x4, res4 + addi.d X, X, 4 * SIZE + addi.d Y, Y, 4 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmadd.s res1, x1, x3, res1 + vfmadd.s res2, x2, x3, res2 + vfmadd.s res3, x1, x4, res3 + vfmadd.s res4, x2, x4, res4 + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L996 + .align 3 + +.L12: + bge $r0, I, .L997 + .align 3 + +.L121: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + addi.d X, X, 4 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmadd.d res1, x1, x3, res1 + vfmadd.d res2, x2, x3, res2 + vfmadd.d res3, x1, x4, res3 + vfmadd.d res4, x2, x4, res4 +#else + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + addi.d X, X, 8 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmadd.s res1, x1, x3, res1 + vfmadd.s res2, x2, x3, res2 + vfmadd.s res3, x1, x4, res3 + vfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L121 + b .L996 + .align 3 + +.L21: + bge $r0, I, .L997 + .align 3 + +.L211: + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX3, Y, 2 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + 
vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + addi.d Y, Y, 4 * SIZE + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmadd.d res1, x1, x3, res1 + vfmadd.d res2, x2, x3, res2 + vfmadd.d res3, x1, x4, res3 + vfmadd.d res4, x2, x4, res4 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + addi.d Y, Y, 8 * SIZE + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmadd.s res1, x1, x3, res1 + vfmadd.s res2, x2, x3, res2 + vfmadd.s res3, x1, x4, res3 + vfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L211 + b .L996 + .align 3 + +.L22: + bge $r0, I, .L997 + .align 3 + +.L222: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + vfmadd.d res1, x1, x3, res1 + vfmadd.d res2, x2, x3, res2 + vfmadd.d res3, x1, x4, res3 + vfmadd.d res4, x2, x4, res4 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + vfmadd.s res1, x1, x3, res1 + vfmadd.s res2, x2, x3, res2 + vfmadd.s res3, x1, x4, res3 + vfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L996: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 + vreplvei.d VX1, res2, 1 + vfadd.d res2, VX1, res2 + vreplvei.d VX1, res3, 1 + vfadd.d res3, VX1, res3 + vreplvei.d VX1, res4, 1 + vfadd.d res4, VX1, res4 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 + vreplvei.w VX1, res2, 1 + vreplvei.w VX2, res2, 2 + vreplvei.w VX3, res2, 3 + vfadd.s res2, VX1, res2 + vfadd.s res2, VX2, res2 + vfadd.s res2, VX3, res2 + vreplvei.w VX1, res3, 1 + vreplvei.w VX2, res3, 2 + vreplvei.w VX3, res3, 3 + vfadd.s res3, VX1, res3 + vfadd.s res3, VX2, res3 + vfadd.s res3, VX3, res3 + vreplvei.w VX1, res4, 1 + vreplvei.w VX2, res4, 2 + vreplvei.w VX3, res4, 3 + vfadd.s res4, VX1, res4 + vfadd.s res4, VX2, res4 + 
vfadd.s res4, VX3, res4 +#endif + .align 3 + +.L997: +#ifdef DOUBLE + andi I, N, 1 +#else + andi I, N, 3 +#endif + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + MADD s1, a1, a3, s1 + MADD s2, a2, a3, s2 + MADD s3, a1, a4, s3 + MADD s4, a2, a4, s4 + addi.d I, I, -1 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: +#ifndef CONJ + SUB $f0, s1, s4 + ADD $f1, s3, s2 +#else + ADD $f0, s1, s4 + SUB $f1, s3, s2 +#endif + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE From fbd612f8c4f3eceb16a645c8a5366af35e7b6a2e Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Thu, 28 Dec 2023 20:07:58 +0800 Subject: [PATCH 16/21] loongarch64: Add ic/zamin optimization functions. --- common_loongarch64.h | 4 + kernel/loongarch64/KERNEL.LOONGSON2K1000 | 2 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 2 + kernel/loongarch64/icamin_lasx.S | 555 +++++++++++++++++++++++ kernel/loongarch64/icamin_lsx.S | 425 +++++++++++++++++ 5 files changed, 988 insertions(+) create mode 100644 kernel/loongarch64/icamin_lasx.S create mode 100644 kernel/loongarch64/icamin_lsx.S diff --git a/common_loongarch64.h b/common_loongarch64.h index 599b4795c..e581e2e3e 100644 --- a/common_loongarch64.h +++ b/common_loongarch64.h @@ -133,6 +133,7 @@ static inline int WhereAmI(void){ #define XVFSUB xvfsub.d #define XVFADD xvfadd.d +#define XVFMUL xvfmul.d #define XVFMADD xvfmadd.d #define XVFMIN xvfmin.d #define XVFMINA xvfmina.d @@ -146,6 +147,7 @@ static inline int WhereAmI(void){ #define VFSUB vfsub.d #define VFADD vfadd.d +#define VFMUL vfmul.d #define VFMADD vfmadd.d #define VFMIN vfmin.d #define VFMINA vfmina.d @@ -185,6 +187,7 @@ static inline int WhereAmI(void){ #define XVFSUB xvfsub.s #define XVFADD xvfadd.s +#define XVFMUL xvfmul.s #define XVFMADD xvfmadd.s #define XVFMIN xvfmin.s #define XVFMINA xvfmina.s @@ -198,6 +201,7 @@ static inline int WhereAmI(void){ #define VFSUB vfsub.s #define VFADD vfadd.s +#define VFMUL vfmul.s #define VFMADD vfmadd.s #define VFMIN vfmin.s #define VFMINA vfmina.s diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index c70120f9a..5e2632574 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -38,6 +38,8 @@ IZAMAXKERNEL = icamax_lsx.S ISAMINKERNEL = iamin_lsx.S IDAMINKERNEL = iamin_lsx.S +ICAMINKERNEL = icamin_lsx.S +IZAMINKERNEL = icamin_lsx.S SCOPYKERNEL = copy_lsx.S DCOPYKERNEL = copy_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 98673ae09..20a4d9a7e 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -38,6 +38,8 @@ IZAMAXKERNEL = icamax_lasx.S ISAMINKERNEL = iamin_lasx.S IDAMINKERNEL = iamin_lasx.S +ICAMINKERNEL = icamin_lasx.S +IZAMINKERNEL = icamin_lasx.S SCOPYKERNEL = copy_lasx.S DCOPYKERNEL = copy_lasx.S diff --git a/kernel/loongarch64/icamin_lasx.S b/kernel/loongarch64/icamin_lasx.S new file mode 100644 index 000000000..01abd45b2 --- /dev/null +++ b/kernel/loongarch64/icamin_lasx.S @@ -0,0 +1,555 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define a0 $f12 +#define a1 $f13 +#define s1 $f15 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VX0 $xr13 +#define VX1 $xr14 +#define VM0 $xr15 +#define VM1 $xr16 +#define VINC4 $xr17 +#define VINC8 $xr18 +#define VI0 $xr20 +#define VI1 $xr21 +#define VI2 $xr22 +#define VI3 $xr8 +#define VI4 $xr19 +#define VT0 $xr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + FABS a0, a0 + FABS a1, a1 + ADD s1, a1, a0 +#ifdef DOUBLE + xvreplve0.d VM0, VM0 + xvxor.v VI3, VI3, VI3 // 0 + li.d I, -1 + xvreplgr2vr.d VI4, I + xvffint.d.l VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, -1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 1 //3 + addi.d i0, i0, -1 + xvinsgr2vr.d VI0, i0, 2 //2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + xvreplve0.w VM0, VM0 + xvxor.v VI3, VI3, VI3 // 0 + li.w I, -1 + xvreplgr2vr.w VI4, I + xvffint.s.w VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, -3 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, 
i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 2 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //6 + addi.w i0, i0, -3 + xvinsgr2vr.w VI0, i0, 4 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //4 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvadd.d VI1, VI1, VINC4 + xvld VX1, X, 4 * SIZE + addi.d I, I, -1 + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmul.d x3, VI4, x1 + xvfmul.d x4, VI4, x2 + xvfcmp.clt.d VT0, x1, VI3 + xvfcmp.clt.d VINC8, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC8 +#else + xvadd.w VI1, VI1, VINC8 + xvld VX1, X, 8 * SIZE + addi.d I, I, -1 + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s x3, VI4, x1 + xvfmul.s x4, VI4, x2 + xvfcmp.clt.s VT0, x1, VI3 + xvfcmp.clt.s VINC4, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC4 +#endif + XVFADD x1, x1, x2 + XVFMIN x3, VM0, x1 + XVCMPEQ VT0, x3, VM0 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, x3, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmin.d VM1, x1, x2 + xvfcmp.ceq.d VT0, VM1, x1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmin.d VM0, x3, x4 + xvfcmp.ceq.d VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmin.d VM0, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#else + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#endif + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + XVCMPLT VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + +.L20: // INCX!=1 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, -1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 1 //3 + addi.d i0, i0, -1 + xvinsgr2vr.d VI0, i0, 2 //2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, -3 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 
+ xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 2 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //6 + addi.w i0, i0, -3 + xvinsgr2vr.w VI0, i0, 4 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //4 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + xvadd.d VI1, VI1, VINC4 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + xvadd.w VI1, VI1, VINC8 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + xvadd.w VI1, VI1, VINC8 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 +#endif + addi.d I, I, -1 + XVFMUL x3, VI4, x1 + XVFMUL x4, VI4, x2 + XVCMPLT VT0, x1, VI3 + XVCMPLT VINC8, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC8 + XVFADD x1, x1, x2 + XVFMIN x3, VM0, x1 + XVCMPEQ VT0, x3, VM0 + xvbitsel.v VM0, x3, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmina.d VM1, x1, x2 + xvfcmp.ceq.d VT0, VM1, x1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmina.d VM0, x3, x4 + xvfcmp.ceq.d VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmina.d VM0, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 +#else + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 +#endif + xvbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + XVCMPLT VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L26: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + XVCMPLT VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + 
XVCMPLT VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + XVCMPLT VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: +#ifdef DOUBLE + movfr2gr.d i0, $f20 + .align 3 + +.L21: //N<4 + andi I, N, 3 + bge $r0, I, .L999 + srai.d i1, N, 2 + slli.d i1, i1, 2 +#else + fmov.s $f16, $f20 + .align 3 + +.L252: + xvxor.v VI0, VI0, VI0 + xvor.v VI0, VI0, VX0 + fmov.s $f13, $f15 + xvxor.v VM0, VM0, VM0 + xvor.v VM0, VM0, VX1 + xvpickve.w VI1, VI0, 4 + xvpickve.w VI2, VI0, 5 + xvpickve.w VI3, VI0, 6 + xvpickve.w VI4, VI0, 7 + xvpickve.w x1, VM0, 4 + xvpickve.w x2, VM0, 5 + xvpickve.w x3, VM0, 6 + xvpickve.w x4, VM0, 7 + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v x1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM0, x1 + xvbitsel.v VM0, VM0, x1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L262 + xvfcmp.clt.s VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L262: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L272 + xvfcmp.clt.s VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L272: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L282 + xvfcmp.clt.s VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L282: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L292 + xvfcmp.clt.s VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L292: + fcmp.clt.s $fcc0, $f15, $f13 + fsel $f15, $f15, $f13, $fcc0 + fsel $f20, $f20, $f16, $fcc0 + movfr2gr.s i0, $f20 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 +#endif + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + addi.d I, I, -1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 + FMIN a1, s1, a0 + CMPEQ $fcc0, s1, a1 + add.d X, X, INCX + fsel s1, a1, s1, $fcc0 + fsel $f20, $f21, $f20, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/icamin_lsx.S b/kernel/loongarch64/icamin_lsx.S new file mode 100644 index 000000000..a08cd33c5 --- /dev/null +++ b/kernel/loongarch64/icamin_lsx.S @@ -0,0 +1,425 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define a0 $f12 +#define a1 $f13 +#define s1 $f15 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VINC4 $vr17 +#define VINC8 $vr18 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + FABS a0, a0 + FABS a1, a1 + ADD s1, a1, a0 + vreplvei.w VM0, VM0, 0 + vxor.v VI3, VI3, VI3 // 0 +#ifdef DOUBLE + li.d I, -1 + vreplgr2vr.d VI4, I + vffint.d.l VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -3 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + li.w I, -1 + vreplgr2vr.w VI4, I + vffint.s.w VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.w i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + addi.w i0, i0, -7 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L10: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 2 * SIZE + addi.d I, I, -1 + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VI4, x1 + vfmul.d x4, VI4, x2 + vfcmp.clt.d VT0, x1, VI3 + vfcmp.clt.d VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + vfadd.d x1, x1, x2 + vfmin.d x3, VM0, x1 + vfcmp.ceq.d VT0, x3, VM0 + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + vld VX0, X, 4 * SIZE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 +#else + vadd.w VI1, VI1, VINC4 + vld VX1, X, 4 * SIZE + addi.d I, I, -1 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 +#endif + VFMUL x3, VI4, x1 + VFMUL x4, VI4, x2 + VCMPLT VT0, x1, VI3 + VCMPLT VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + VFADD x1, x1, x2 + VFMIN x3, VM0, x1 + VCMPEQ VT0, x3, VM0 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L10 + 
.align 3 + +.L15: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmina.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC4, VI2, VI1, VT0 + vfmina.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC8, VI4, VI3, VT0 + vfmina.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L26 +#endif + .align 3 + +.L20: // INCX!=1 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -3 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + addi.w i0, i0, -7 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vadd.d VI1, VI1, VINC4 + vfmul.d x3, VI4, x1 + vfmul.d x4, VI4, x2 + vfcmp.clt.d VT0, x1, VI3 + vfcmp.clt.d VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + vfadd.d x1, x1, x2 + vfmin.d x3, VM0, x1 + ld.d t1, X, 0 * SIZE + vfcmp.ceq.d VT0, x3, VM0 + ld.d t2, X, 1 * SIZE + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vadd.d VI1, VI1, VINC4 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + vadd.w VI1, VI1, VINC4 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 +#endif + addi.d I, I, -1 + VFMUL x3, VI4, x1 + VFMUL x4, VI4, x2 + VCMPLT VT0, x1, VI3 + VCMPLT VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + VFADD x1, x1, x2 + VFMIN x3, VM0, x1 + VCMPEQ VT0, x3, VM0 + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + 
fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmina.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC4, VI2, VI1, VT0 + vfmina.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC8, VI4, VI3, VT0 + vfmina.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 +#endif + .align 3 + +.L26: +#ifdef DOUBLE + vfmina.d VM0, x1, x2 + vfcmp.ceq.d VT0, x1, VM0 +#else + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + vfcmp.clt.s VT0, VI2, VI0 +#endif + vbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: +#ifdef DOUBLE + movfr2gr.d i0, $f20 + .align 3 +#else + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + vfcmp.clt.s VT0, VI3, VI0 + vbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + vfcmp.clt.s VT0, VI4, VI0 + vbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: + movfr2gr.s i0, $f20 + .align 3 + +#endif +.L21: //N<4 + andi I, N, 3 + bge $r0, I, .L999 + srai.d i1, N, 2 + slli.d i1, i1, 2 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + addi.d I, I, -1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 + FMIN a1, s1, a0 + CMPEQ $fcc0, s1, a1 + add.d X, X, INCX + fsel s1, a1, s1, $fcc0 + fsel $f20, $f21, $f20, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE From 3c53ded315901759f0ee2a77d07121c8905fb18d Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Thu, 28 Dec 2023 20:26:01 +0800 Subject: [PATCH 17/21] loongarch64: Add c/znrm2 optimization functions. 
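
The c/znrm2 kernels compute the Euclidean norm of a complex vector.
cnrm2 widens single-precision inputs to double and accumulates the sum
of squares directly; znrm2 makes two passes: it first finds the largest
absolute component, then scales by its reciprocal, accumulates the
scaled sum of squares, and rescales at the end so the intermediate
squares cannot overflow.

A minimal scalar sketch of that scaling scheme, for reference only
(the helper name znrm2_ref and its exact shape are illustrative, not
part of this patch; incx is counted in complex elements):

    #include <math.h>

    /* Reference for the overflow-safe norm the znrm2 kernels implement. */
    double znrm2_ref(int n, const double *x, int incx)
    {
        double amax = 0.0, ssq = 0.0;

        /* Pass 1: largest |real| or |imaginary| component. */
        for (int i = 0; i < n; i++) {
            double re = fabs(x[2 * i * incx]);
            double im = fabs(x[2 * i * incx + 1]);
            if (re > amax) amax = re;
            if (im > amax) amax = im;
        }
        if (amax == 0.0) return 0.0;

        /* Pass 2: sum of squares of the components scaled by 1/amax. */
        double alpha = 1.0 / amax;
        for (int i = 0; i < n; i++) {
            double re = x[2 * i * incx] * alpha;
            double im = x[2 * i * incx + 1] * alpha;
            ssq += re * re + im * im;
        }
        return amax * sqrt(ssq);
    }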
--- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 2 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 2 + kernel/loongarch64/cnrm2_lasx.S | 147 +++++++++++++ kernel/loongarch64/cnrm2_lsx.S | 155 ++++++++++++++ kernel/loongarch64/znrm2_lasx.S | 252 ++++++++++++++++++++++ kernel/loongarch64/znrm2_lsx.S | 260 +++++++++++++++++++++++ 6 files changed, 818 insertions(+) create mode 100644 kernel/loongarch64/cnrm2_lasx.S create mode 100644 kernel/loongarch64/cnrm2_lsx.S create mode 100644 kernel/loongarch64/znrm2_lasx.S create mode 100644 kernel/loongarch64/znrm2_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 5e2632574..826588318 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -70,6 +70,8 @@ DROTKERNEL = rot_lsx.S SNRM2KERNEL = snrm2_lsx.S DNRM2KERNEL = dnrm2_lsx.S +CNRM2KERNEL = cnrm2_lsx.S +ZNRM2KERNEL = znrm2_lsx.S DGEMMKERNEL = dgemm_kernel_8x4.S DGEMMINCOPY = dgemm_ncopy_8_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 20a4d9a7e..b61ecd427 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -70,6 +70,8 @@ DROTKERNEL = rot_lasx.S SNRM2KERNEL = snrm2_lasx.S DNRM2KERNEL = dnrm2_lasx.S +CNRM2KERNEL = cnrm2_lasx.S +ZNRM2KERNEL = znrm2_lasx.S DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S diff --git a/kernel/loongarch64/cnrm2_lasx.S b/kernel/loongarch64/cnrm2_lasx.S new file mode 100644 index 000000000..3a60069ac --- /dev/null +++ b/kernel/loongarch64/cnrm2_lasx.S @@ -0,0 +1,147 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r12 +#define t2 $r13 +#define t3 $r14 +#define t4 $r15 +#define a1 $f15 +#define a2 $f16 +#define res $f19 +#define VX0 $xr15 +#define VX1 $xr16 +#define VX2 $xr17 +#define VX3 $xr18 +#define VX4 $xr21 +#define res1 $xr19 +#define res2 $xr20 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + bge $r0, N, .L999 + beq $r0, INCX, .L999 + li.d TEMP, SIZE + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bge $r0, I, .L997 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvfcvtl.d.s VX1, VX0 + xvfcvth.d.s VX2, VX0 + xvfmadd.d res1, VX1, VX1, res1 + xvfmadd.d res2, VX2, VX2, res2 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + .align 3 + b .L996 + +.L20: + bge $r0, I, .L997 + .align 3 + +.L21: + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfcvtl.d.s VX1, VX0 + xvfcvth.d.s VX2, VX0 + xvfmadd.d res1, VX1, VX1, res1 + xvfmadd.d res2, VX2, VX2, res2 + addi.d I, I, -1 + blt $r0, I, .L21 + b .L996 + +.L996: + xvfadd.d res1, res1, res2 + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 + .align 3 + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.s a1, X, 0 * SIZE + fld.s a2, X, 1 * SIZE + addi.d I, I, -1 + fcvt.d.s a1, a1 + fcvt.d.s a2, a2 + fmadd.d res, a1, a1, res + fmadd.d res, a2, a2, res + add.d X, X, INCX + blt $r0, I, .L998 + .align 3 + +.L999: + fsqrt.d res, res + move $r4, $r17 + fcvt.s.d $f0, res + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/cnrm2_lsx.S b/kernel/loongarch64/cnrm2_lsx.S new file mode 100644 index 000000000..20950ba17 --- /dev/null +++ b/kernel/loongarch64/cnrm2_lsx.S @@ -0,0 +1,155 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r12 +#define t2 $r13 +#define t3 $r14 +#define t4 $r15 +#define a1 $f15 +#define a2 $f16 +#define res $f19 +#define VX0 $vr15 +#define VX1 $vr16 +#define VX2 $vr17 +#define VX3 $vr18 +#define VX4 $vr21 +#define res1 $vr19 +#define res2 $vr20 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + bge $r0, N, .L999 + beq $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bge $r0, I, .L997 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vfcvtl.d.s VX1, VX0 + vfcvth.d.s VX2, VX0 + vfmadd.d res1, VX1, VX1, res1 + vfmadd.d res2, VX2, VX2, res2 + vld VX0, X, 4 * SIZE + vfcvtl.d.s VX3, VX0 + vfcvth.d.s VX4, VX0 + vfmadd.d res1, VX3, VX3, res1 + vfmadd.d res2, VX4, VX4, res2 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + b .L996 + .align 3 + +.L20: + bge $r0, I, .L997 + .align 3 + +.L21: + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfcvtl.d.s VX1, VX0 + vfcvth.d.s VX2, VX0 + vfmadd.d res1, VX1, VX1, res1 + vfmadd.d res2, VX2, VX2, res2 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfcvtl.d.s VX3, VX0 + vfcvth.d.s VX4, VX0 + vfmadd.d res1, VX3, VX3, res1 + vfmadd.d res2, VX4, VX4, res2 + addi.d I, I, -1 + blt $r0, I, .L21 + b .L996 + .align 3 + +.L996: + vfadd.d res1, res1, res2 + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 + .align 3 + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.s a1, X, 0 * SIZE + fld.s a2, X, 1 * SIZE + addi.d I, I, -1 + fcvt.d.s a1, a1 + fcvt.d.s a2, a2 + fmadd.d res, a1, a1, res + fmadd.d res, a2, a2, res + add.d X, X, INCX + blt $r0, I, .L998 + .align 3 + +.L999: + fsqrt.d res, res + move $r4, $r17 + fcvt.s.d $f0, $f19 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/znrm2_lasx.S b/kernel/loongarch64/znrm2_lasx.S new file mode 100644 index 000000000..53f8a6e05 --- /dev/null +++ b/kernel/loongarch64/znrm2_lasx.S @@ -0,0 +1,252 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define XX $r19 +#define I $r17 +#define TEMP $r18 +#define t1 $r12 +#define t2 $r13 +#define t3 $r14 +#define t4 $r15 +#define INF $f23 +#define a1 $f22 +#define max $f17 +#define ALPHA $f12 +#define a3 $f15 +#define a2 $f16 +#define VX0 $xr15 +#define VX1 $xr16 +#define VM0 $xr17 +#define VM1 $xr18 +#define VM2 $xr13 +#define VM3 $xr14 +#define res1 $xr19 +#define res2 $xr20 +#define VALPHA $xr21 + + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + xvxor.v VM0, VM0, VM0 + bge $r0, N, .L999 + beq $r0, INCX, .L999 + move XX, X + // Init INF + addi.d TEMP, $r0, 0x7FF + slli.d TEMP, TEMP, 52 + MTC INF, TEMP + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bge $r0, I, .L97 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfmaxa.d VM1, VX1, VX0 + xvfmaxa.d VM0, VM0, VM1 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + b .L96 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L97 + .align 3 + +.L21: + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvfmaxa.d VM1, VX0, VX1 + xvfmaxa.d VM0, VM0, VM1 + addi.d I, I, -1 + blt $r0, I, .L21 + b .L96 + .align 3 + +.L96: + xvpickve.d VX0, VM0, 1 + xvpickve.d VX1, VM0, 2 + xvpickve.d VM3, VM0, 3 + xvfmaxa.d VM1, VX0, VX1 + xvfmaxa.d VM2, VM3, VM0 + xvfmaxa.d VM0, VM1, VM2 + .align 3 + +.L97: + andi I, N, 3 + bge $r0, I, .L99 + .align 3 + +.L98: + fld.d a3, X, 0 * SIZE + fld.d a2, X, 1 * SIZE + fmaxa.d a3, a2, a3 + fmaxa.d max, a3, max + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L98 + .align 3 + +.L99: + fabs.d max, max + lu12i.w TEMP, 0x3f800 // 1 + movgr2fr.d a1, $r0 + movgr2fr.w ALPHA, TEMP + CMPEQ $fcc0, max, a1 + fcvt.d.s ALPHA, 
ALPHA + bcnez $fcc0, .L999 + fdiv.d ALPHA, ALPHA, max + CMPEQ $fcc0, INF, ALPHA + bcnez $fcc0, .L999 + movfr2gr.d TEMP, ALPHA + xvreplgr2vr.d VALPHA, TEMP + +.L100: + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L120 + bge $r0, I, .L997 + .align 3 + +.L110: + xvld VX0, XX, 0 * SIZE + xvld VX1, XX, 4 * SIZE + xvfmul.d VM2, VX0, VALPHA + xvfmul.d VM3, VX1, VALPHA + xvfmadd.d res1, VM2, VM2, res1 + xvfmadd.d res2, VM3, VM3, res2 + addi.d XX, XX, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L110 + b .L996 + .align 3 + +.L120: + bge $r0, I, .L997 + .align 3 + +.L121: + ld.d t1, XX, 0 * SIZE + ld.d t2, XX, 1 * SIZE + add.d XX, XX, INCX + ld.d t3, XX, 0 * SIZE + ld.d t4, XX, 1 * SIZE + add.d XX, XX, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, XX, 0 * SIZE + ld.d t2, XX, 1 * SIZE + add.d XX, XX, INCX + ld.d t3, XX, 0 * SIZE + ld.d t4, XX, 1 * SIZE + add.d XX, XX, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmul.d VM2, VX0, VALPHA + xvfmul.d VM3, VX1, VALPHA + xvfmadd.d res1, VM2, VM2, res1 + xvfmadd.d res2, VM3, VM3, res2 + addi.d I, I, -1 + blt $r0, I, .L121 + b .L996 + .align 3 + +.L996: + xvfadd.d res1, res1, res2 + xvpickve.d VX0, res1, 1 + xvpickve.d VX1, res1, 2 + xvpickve.d VM2, res1, 3 + xvfadd.d res1, VX0, res1 + xvfadd.d VX1, VX1, VM2 + xvfadd.d res1, VX1, res1 + .align 3 + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.d a3, XX, 0 * SIZE + fld.d a2, XX, 1 * SIZE + addi.d I, I, -1 + fmul.d a3, a3, ALPHA + fmadd.d $f19, a3, a3, $f19 + fmul.d a2, a2, ALPHA + fmadd.d $f19, a2, a2, $f19 + add.d XX, XX , INCX + blt $r0, I, .L998 + .align 3 + +.L999: + fsqrt.d $f19, $f19 + fmul.d $f0, max, $f19 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/znrm2_lsx.S b/kernel/loongarch64/znrm2_lsx.S new file mode 100644 index 000000000..14c59d504 --- /dev/null +++ b/kernel/loongarch64/znrm2_lsx.S @@ -0,0 +1,260 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define XX $r19 +#define I $r17 +#define TEMP $r18 +#define t1 $r12 +#define t2 $r13 +#define t3 $r14 +#define t4 $r15 +#define INF $f23 +#define a1 $f22 +#define max $f17 +#define ALPHA $f12 +#define a3 $f15 +#define a2 $f16 +#define VX0 $vr15 +#define VX1 $vr16 +#define VM0 $vr17 +#define VM1 $vr18 +#define VM2 $vr13 +#define VM3 $vr14 +#define res1 $vr19 +#define res2 $vr20 +#define VALPHA $vr21 + + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + vxor.v VM0, VM0, VM0 + bge $r0, N, .L999 + beq $r0, INCX, .L999 + move XX, X + // Init INF + addi.d TEMP, $r0, 0x7FF + slli.d TEMP, TEMP, 52 + MTC INF, TEMP + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bge $r0, I, .L97 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfmaxa.d VM1, VX1, VX0 + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vfmaxa.d VM2, VX1, VX0 + vfmaxa.d VM3, VM1, VM2 + vfmaxa.d VM0, VM0, VM3 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + b .L96 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L97 + .align 3 + +.L21: + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmaxa.d VM1, VX0, VX1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM2, VX0, VX1 + vfmaxa.d VM3, VM1, VM2 + vfmaxa.d VM0, VM0, VM3 + addi.d I, I, -1 + blt $r0, I, .L21 + b .L96 + .align 3 + +.L96: + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + vfmaxa.d VM0, VX0, VX1 + .align 3 + +.L97: + andi I, N, 3 + bge $r0, I, .L99 + .align 3 + +.L98: + fld.d a3, X, 0 * SIZE + fld.d a2, X, 1 * SIZE + fmaxa.d a3, a2, a3 + fmaxa.d max, a3, max + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L98 + .align 3 + +.L99: + fabs.d max, max + lu12i.w TEMP, 0x3f800 // 1 + movgr2fr.d a1, $r0 + movgr2fr.w ALPHA, TEMP + CMPEQ $fcc0, max, a1 + fcvt.d.s ALPHA, ALPHA + bcnez $fcc0, .L999 + fdiv.d ALPHA, ALPHA, max + CMPEQ $fcc0, INF, ALPHA + bcnez $fcc0, .L999 + movfr2gr.d TEMP, ALPHA + vreplgr2vr.d VALPHA, TEMP + +.L100: + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L120 + bge $r0, I, .L997 + .align 3 + +.L110: + vld VX0, XX, 0 * SIZE + vld VX1, XX, 2 * SIZE + vfmul.d VM2, VX0, VALPHA + vfmul.d VM3, VX1, VALPHA + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 + vld VX0, XX, 4 * SIZE + vld VX1, XX, 6 * SIZE + vfmul.d VM2, VX0, VALPHA + vfmul.d VM3, VX1, VALPHA + vfmadd.d res1, VM2, VM2, res1 
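For reference while reading this znrm2_lsx.S hunk: the kernel follows the usual two-pass scaled Euclidean norm for a double-complex vector. A first pass reduces max|x_i| over the real and imaginary components with vfmaxa.d/fmaxa.d, the reciprocal of that maximum is broadcast into VALPHA, the loops at .L110/.L121 accumulate sum((x_i * 1/max)^2) into res1/res2, and .L999 rescales the result with fsqrt.d followed by fmul.d. A minimal C sketch of the same scheme, not part of the patch (helper name, signature, and the interleaved (re, im) layout assumption are illustrative only):

    #include <math.h>
    #include <stddef.h>

    /* Two-pass scaled nrm2 for a complex vector stored as interleaved (re, im)
     * doubles; n complex elements, stride inc_x in complex elements.  Mirrors
     * the structure of the assembly: pass 1 finds the largest-magnitude
     * component, pass 2 sums the squares of the scaled components, and the
     * result is rescaled at the end. */
    static double znrm2_ref(size_t n, const double *x, size_t inc_x)
    {
        if (n == 0 || inc_x == 0) return 0.0;   /* early exits on N<=0 / INCX==0 */

        double m = 0.0;
        for (size_t i = 0; i < n; i++) {        /* pass 1: max |component| */
            double re = fabs(x[2 * i * inc_x]);
            double im = fabs(x[2 * i * inc_x + 1]);
            if (re > m) m = re;
            if (im > m) m = im;
        }
        if (m == 0.0) return 0.0;               /* matches the branch taken when max == 0 */

        double alpha = 1.0 / m, sum = 0.0;
        for (size_t i = 0; i < n; i++) {        /* pass 2: sum of scaled squares */
            double re = x[2 * i * inc_x] * alpha;
            double im = x[2 * i * inc_x + 1] * alpha;
            sum += re * re + im * im;
        }
        return m * sqrt(sum);                   /* fsqrt.d + fmul.d in .L999 */
    }

The sketch omits the extra guard the assembly performs (branching to .L999 when 1/max overflows to infinity); it is meant only to show the scaling scheme the vector loops implement.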
+ vfmadd.d res2, VM3, VM3, res2 + addi.d XX, XX, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L110 + b .L996 + .align 3 + +.L120: + bge $r0, I, .L997 + .align 3 + +.L121: + ld.d t1, XX, 0 * SIZE + ld.d t2, XX, 1 * SIZE + add.d XX, XX, INCX + ld.d t3, XX, 0 * SIZE + ld.d t4, XX, 1 * SIZE + add.d XX, XX, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmul.d VM2, VX0, VALPHA + ld.d t1, XX, 0 * SIZE + vfmul.d VM3, VX1, VALPHA + ld.d t2, XX, 1 * SIZE + add.d XX, XX, INCX + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 + ld.d t3, XX, 0 * SIZE + ld.d t4, XX, 1 * SIZE + add.d XX, XX, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmul.d VM2, VX0, VALPHA + vfmul.d VM3, VX1, VALPHA + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 + addi.d I, I, -1 + blt $r0, I, .L121 + b .L996 + .align 3 + +.L996: + vfadd.d res1, res1, res2 + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 + .align 3 + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.d a3, XX, 0 * SIZE + fld.d a2, XX, 1 * SIZE + addi.d I, I, -1 + fmul.d a3, a3, ALPHA + fmadd.d $f19, a3, a3, $f19 + fmul.d a2, a2, ALPHA + fmadd.d $f19, a2, a2, $f19 + add.d XX, XX , INCX + blt $r0, I, .L998 + .align 3 + +.L999: + fsqrt.d $f19, $f19 + fmul.d $f0, max, $f19 + jirl $r0, $r1, 0x0 + + EPILOGUE From 1ec5dded43b9d5e14875061f62d5edec480f9584 Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Thu, 28 Dec 2023 21:23:59 +0800 Subject: [PATCH 18/21] loongarch64: Add c/zrot optimization functions. Signed-off-by: Hao Chen --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 2 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 2 + kernel/loongarch64/crot_lasx.S | 1079 ++++++++++++++++++++++ kernel/loongarch64/crot_lsx.S | 907 ++++++++++++++++++ 4 files changed, 1990 insertions(+) create mode 100644 kernel/loongarch64/crot_lasx.S create mode 100644 kernel/loongarch64/crot_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 826588318..02ea4304e 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -67,6 +67,8 @@ ZASUMKERNEL = casum_lsx.S SROTKERNEL = rot_lsx.S DROTKERNEL = rot_lsx.S +CROTKERNEL = crot_lsx.S +ZROTKERNEL = crot_lsx.S SNRM2KERNEL = snrm2_lsx.S DNRM2KERNEL = dnrm2_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index b61ecd427..462698f85 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -67,6 +67,8 @@ ZASUMKERNEL = casum_lasx.S SROTKERNEL = rot_lasx.S DROTKERNEL = rot_lasx.S +CROTKERNEL = crot_lasx.S +ZROTKERNEL = crot_lasx.S SNRM2KERNEL = snrm2_lasx.S DNRM2KERNEL = dnrm2_lasx.S diff --git a/kernel/loongarch64/crot_lasx.S b/kernel/loongarch64/crot_lasx.S new file mode 100644 index 000000000..d4ec1e22c --- /dev/null +++ b/kernel/loongarch64/crot_lasx.S @@ -0,0 +1,1079 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define C $f0 +#define S $f1 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VT0 $xr10 +#define VT1 $xr18 +#define VXC $xr23 +#define VXS $xr9 +#define VXZ $xr11 +#define x1 $xr12 +#define x2 $xr13 +#define x3 $xr14 +#define x4 $xr15 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + MTG t1, C + MTG t2, S + MTG t3, a1 +#ifdef DOUBLE + xvreplgr2vr.d VXC, t1 + xvreplgr2vr.d VXS, t2 + xvreplgr2vr.d VXZ, t3 + srai.d I, N, 2 +#else + xvreplgr2vr.w VXC, t1 + xvreplgr2vr.w VXS, t2 + xvreplgr2vr.w VXZ, t3 + srai.d I, N, 3 +#endif + beq INCX, $r0, .L996 + beq INCY, $r0, .L996 + bne INCX, TEMP, .L22 // INCX!=1 or INCY!=1 + bne INCY, TEMP, .L22 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L110 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L112 // C!=0 S==0 + b .L111 // C!=0 S!=0 + .align 3 + +.L110: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L114 // C==0 S==0 + b .L113 // C==0 S!=0 + .align 3 + +.L111: // C!=0 S!=0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 + xvfmul.d VX0, x1, VXC + xvfmadd.d VX0, x3, VXS, VX0 + xvfmul.d VX1, x1, VXS + xvfmsub.d VX1, x3, VXC, VX1 + xvfmul.d VX2, x2, VXC + xvfmadd.d VX2, x4, VXS, VX2 + xvfmul.d VX3, x2, VXS + xvfmsub.d VX3, x4, VXC, VX3 + xvilvl.d x1, VX2 ,VX0 + xvilvh.d x2, VX2, VX0 + xvilvl.d x3, VX3 ,VX1 + xvilvh.d x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 4 * SIZE + xvst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvld VX3, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvpickev.w x3, VX3, VX2 + 
xvpickod.w x4, VX3, VX2 + xvfmul.s VX0, x1, VXC + xvfmadd.s VX0, x3, VXS, VX0 + xvfmul.s VX1, x1, VXS + xvfmsub.s VX1, x3, VXC, VX1 + xvfmul.s VX2, x2, VXC + xvfmadd.s VX2, x4, VXS, VX2 + xvfmul.s VX3, x2, VXS + xvfmsub.s VX3, x4, VXC, VX3 + xvilvl.w x1, VX2 ,VX0 + xvilvh.w x2, VX2, VX0 + xvilvl.w x3, VX3 ,VX1 + xvilvh.w x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 8 * SIZE + xvst x4, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // C!=0 S==0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 + xvfmul.d VX0, x1, VXC + xvfmul.d VX1, x3, VXC + xvfmul.d VX2, x2, VXC + xvfmul.d VX3, x4, VXC + xvilvl.d x1, VX2 ,VX0 + xvilvh.d x2, VX2, VX0 + xvilvl.d x3, VX3 ,VX1 + xvilvh.d x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 4 * SIZE + xvst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvld VX3, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 + xvfmul.s VX0, x1, VXC + xvfmul.s VX1, x3, VXC + xvfmul.s VX2, x2, VXC + xvfmul.s VX3, x4, VXC + xvilvl.w x1, VX2 ,VX0 + xvilvh.w x2, VX2, VX0 + xvilvl.w x3, VX3 ,VX1 + xvilvh.w x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 8 * SIZE + xvst x4, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // C==0 S!=0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 + xvfmul.d VX0, x3, VXS + xvfmul.d VX1, x1, VXS + xvfsub.d VX1, VXZ, VX1 + xvfmul.d VX2, x4, VXS + xvfmul.d VX3, x2, VXS + xvfsub.d VX3, VXZ, VX3 + xvilvl.d x1, VX2 ,VX0 + xvilvh.d x2, VX2, VX0 + xvilvl.d x3, VX3 ,VX1 + xvilvh.d x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 4 * SIZE + xvst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvld VX3, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 + xvfmul.s VX0, x3, VXS + xvfmul.s VX1, x1, VXS + xvfsub.s VX1, VXZ, VX1 + xvfmul.s VX2, x4, VXS + xvfmul.s VX3, x2, VXS + xvfsub.s VX3, VXZ, VX3 + xvilvl.w x1, VX2 ,VX0 + xvilvh.w x2, VX2, VX0 + xvilvl.w x3, VX3 ,VX1 + xvilvh.w x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 8 * SIZE + xvst x4, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // C==0 S==0 + xvst VXZ, X, 0 * SIZE + xvst VXZ, Y, 0 * SIZE +#ifdef DOUBLE + xvst VXZ, X, 4 * SIZE + xvst VXZ, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvst VXZ, X, 8 * SIZE + xvst VXZ, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + move XX, X + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L220 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L222 // C!=0 S==0 + b .L221 // C!=0 S!=0 + .align 3 + +.L220: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L224 // C==0 S==0 + b .L223 // C==0 S!=0 + .align 3 + +.L221: 
// C!=0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 1 + xvinsgr2vr.d x4, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.d x3, t1, 2 + xvinsgr2vr.d x4, t2, 2 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + add.d Y, Y, INCY + + xvfmul.d VX0, x1, VXC + xvfmadd.d VX0, x3, VXS, VX0 + xvfmul.d VX1, x1, VXS + xvfmsub.d VX1, x3, VXC, VX1 + xvfmul.d VX2, x2, VXC + xvfmadd.d VX2, x4, VXS, VX2 + xvfmul.d VX3, x2, VXS + xvfmsub.d VX3, x4, VXC, VX3 + xvstelm.d VX0, XX, 0, 0 + xvstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 1 + xvstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 2 + xvstelm.d VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 3 + xvstelm.d VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.d VX1, YY, 0, 0 + xvstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 1 + xvstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 2 + xvstelm.d VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 3 + xvstelm.d VX3, YY, 1 * SIZE, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w 
x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + add.d Y, Y, INCY + + xvfmul.s VX0, x1, VXC + xvfmadd.s VX0, x3, VXS, VX0 + xvfmul.s VX1, x1, VXS + xvfmsub.s VX1, x3, VXC, VX1 + xvfmul.s VX2, x2, VXC + xvfmadd.s VX2, x4, VXS, VX2 + xvfmul.s VX3, x2, VXS + xvfmsub.s VX3, x4, VXC, VX3 + xvstelm.w VX0, XX, 0, 0 + xvstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 1 + xvstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 2 + xvstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 3 + xvstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.w VX1, YY, 0, 0 + xvstelm.w VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 1 + xvstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 2 + xvstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 3 + xvstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w VX0, XX, 0, 4 + xvstelm.w VX2, XX, 1 * SIZE, 4 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 5 + xvstelm.w VX2, XX, 1 * SIZE, 5 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 6 + xvstelm.w VX2, XX, 1 * SIZE, 6 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 7 + xvstelm.w VX2, XX, 1 * SIZE, 7 + add.d XX, XX, INCX + xvstelm.w VX1, YY, 0, 4 + xvstelm.w VX3, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 5 + xvstelm.w VX3, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 6 + xvstelm.w VX3, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 7 + xvstelm.w VX3, YY, 1 * SIZE, 7 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: // C!=0 S==0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 1 + xvinsgr2vr.d x4, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.d x3, t1, 2 + xvinsgr2vr.d x4, t2, 2 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX0, x1, VXC + xvfmul.d VX1, x3, VXC + xvfmul.d VX2, x2, VXC + xvfmul.d VX3, x4, VXC + xvstelm.d VX0, XX, 0, 0 + xvstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 1 + xvstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 2 + xvstelm.d VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 3 + xvstelm.d VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.d VX1, YY, 0, 0 + xvstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 1 + xvstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 2 + xvstelm.d VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 3 + xvstelm.d VX3, YY, 1 * SIZE, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 
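For reference while reading the strided (INCX!=1 or INCY!=1) paths of crot_lasx.S around this point: since the complex elements are not contiguous, the kernel gathers them one scalar at a time (ld.w/ld.d followed by xvinsgr2vr), keeping real parts in x1/x3 and imaginary parts in x2/x4, applies the rotation, and scatters the results back with xvstelm. The per-element operation, as confirmed by the scalar tail at .L998 (MUL/MADD/MSUB), is the standard BLAS plane rotation with real c and s applied to the real and imaginary parts independently. A minimal C sketch, not part of the patch (function name hypothetical, single-precision shown):

    #include <stddef.h>

    /* Apply the plane rotation  x <- c*x + s*y,  y <- c*y - s*x  to complex
     * vectors stored as interleaved (re, im) floats; n complex elements,
     * strides given in complex elements.  Because c and s are real scalars,
     * the real and imaginary parts are transformed independently, exactly as
     * in the scalar tail at .L998. */
    static void crot_ref(size_t n, float *x, size_t inc_x,
                         float *y, size_t inc_y, float c, float s)
    {
        for (size_t i = 0; i < n; i++) {
            float *px = x + 2 * i * inc_x;
            float *py = y + 2 * i * inc_y;
            for (int k = 0; k < 2; k++) {   /* k = 0: real part, k = 1: imaginary part */
                float xv = px[k], yv = py[k];
                px[k] = c * xv + s * yv;
                py[k] = c * yv - s * xv;
            }
        }
    }

The special-case branches in the kernel (.L220/.L222/.L223/.L224) simply skip the multiplications that vanish when c or s is zero; the sketch above corresponds to the general c!=0, s!=0 case.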
+ ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX0, x1, VXC + xvfmul.s VX1, x3, VXC + xvfmul.s VX2, x2, VXC + xvfmul.s VX3, x4, VXC + xvstelm.w VX0, XX, 0, 0 + xvstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 1 + xvstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 2 + xvstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 3 + xvstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.w VX1, YY, 0, 0 + xvstelm.w VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 1 + xvstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 2 + xvstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 3 + xvstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w VX0, XX, 0, 4 + xvstelm.w VX2, XX, 1 * SIZE, 4 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 5 + xvstelm.w VX2, XX, 1 * SIZE, 5 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 6 + xvstelm.w VX2, XX, 1 * SIZE, 6 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 7 + xvstelm.w VX2, XX, 1 * SIZE, 7 + add.d XX, XX, INCX + xvstelm.w VX1, YY, 0, 4 + xvstelm.w VX3, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 5 + xvstelm.w VX3, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 6 + xvstelm.w VX3, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 7 + xvstelm.w VX3, YY, 1 * SIZE, 7 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: // C==0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 1 + xvinsgr2vr.d x4, t4, 1 + 
ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.d x3, t1, 2 + xvinsgr2vr.d x4, t2, 2 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX0, x3, VXS + xvfmul.d VX1, x1, VXS + xvfsub.d VX1, VXZ, VX1 + xvfmul.d VX2, x4, VXS + xvfmul.d VX3, x2, VXS + xvfsub.d VX3, VXZ, VX3 + xvstelm.d VX0, XX, 0, 0 + xvstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 1 + xvstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 2 + xvstelm.d VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 3 + xvstelm.d VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.d VX1, YY, 0, 0 + xvstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 1 + xvstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 2 + xvstelm.d VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 3 + xvstelm.d VX3, YY, 1 * SIZE, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX0, x3, VXS + xvfmul.s VX1, x1, VXS + xvfsub.s VX1, VXZ, VX1 + xvfmul.s VX2, x4, VXS + xvfmul.s VX3, x2, VXS + xvfsub.s VX3, VXZ, VX3 + xvstelm.w VX0, XX, 0, 0 + xvstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 1 + xvstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 2 + xvstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 3 + xvstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.w VX1, 
YY, 0, 0 + xvstelm.w VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 1 + xvstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 2 + xvstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 3 + xvstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w VX0, XX, 0, 4 + xvstelm.w VX2, XX, 1 * SIZE, 4 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 5 + xvstelm.w VX2, XX, 1 * SIZE, 5 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 6 + xvstelm.w VX2, XX, 1 * SIZE, 6 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 7 + xvstelm.w VX2, XX, 1 * SIZE, 7 + add.d XX, XX, INCX + xvstelm.w VX1, YY, 0, 4 + xvstelm.w VX3, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 5 + xvstelm.w VX3, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 6 + xvstelm.w VX3, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 7 + xvstelm.w VX3, YY, 1 * SIZE, 7 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: // C==0 S==0 +#ifdef DOUBLE + xvstelm.d VXZ, XX, 0, 0 + xvstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 0 + xvstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 0 + xvstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 0 + xvstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, YY, 0, 0 + xvstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 0 + xvstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 0 + xvstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 0 + xvstelm.d VXZ, YY, 1 * SIZE, 0 +#else + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + move X, XX + move Y, YY + b .L997 + .align 3 + +.L996: + move I, N + b .L998 + .align 3 + +.L997: +#ifdef DOUBLE + andi I, N, 3 +#else + andi I, N, 7 +#endif + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + MUL s1, a1, C + MADD s1, a3, S, s1 + MUL s2, a1, S + MSUB s2, a3, C, s2 + MUL s3, a2, C + MADD s3, a4, S, s3 + MUL s4, a2, S + MSUB s4, a4, C, s4 + addi.d I, I, -1 + ST s1, X, 0 * SIZE + ST s2, Y, 0 * SIZE + ST s3, X, 1 * SIZE + ST s4, Y, 1 * SIZE + add.d X, X, 
INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/crot_lsx.S b/kernel/loongarch64/crot_lsx.S new file mode 100644 index 000000000..126257edc --- /dev/null +++ b/kernel/loongarch64/crot_lsx.S @@ -0,0 +1,907 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define C $f0 +#define S $f1 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VT0 $vr10 +#define VT1 $vr18 +#define VXC $vr23 +#define VXS $vr9 +#define VXZ $vr11 +#define x1 $vr12 +#define x2 $vr13 +#define x3 $vr14 +#define x4 $vr15 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + MTG t1, C + MTG t2, S + MTG t3, a1 +#ifdef DOUBLE + vreplgr2vr.d VXC, t1 + vreplgr2vr.d VXS, t2 + vreplgr2vr.d VXZ, t3 +#else + vreplgr2vr.w VXC, t1 + vreplgr2vr.w VXS, t2 + vreplgr2vr.w VXZ, t3 + srai.d I, N, 2 +#endif + beq INCX, $r0, .L996 + beq INCY, $r0, .L996 + bne INCX, TEMP, .L22 // INCX!=1 or INCY!=1 + bne INCY, TEMP, .L22 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L110 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L112 // C!=0 S==0 + b .L111 // C!=0 S!=0 + .align 3 + +.L110: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L114 // C==0 S==0 + b .L113 // C==0 S!=0 + .align 3 + +.L111: // C!=0 S!=0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmul.d VX0, x1, VXC + vfmadd.d VX0, x3, VXS, VX0 + vfmul.d VX1, x1, VXS + vfmsub.d VX1, x3, VXC, VX1 + vfmul.d VX2, x2, VXC + vfmadd.d VX2, x4, VXS, VX2 + vfmul.d VX3, x2, VXS + vfmsub.d VX3, x4, VXC, VX3 + vilvl.d x1, VX2 ,VX0 + vilvh.d x2, VX2, VX0 + vilvl.d x3, VX3 ,VX1 + vilvh.d x4, VX3, VX1 + vst x1, X, 0 * SIZE + vst x3, Y, 0 * SIZE + vst x2, X, 2 * SIZE + vst x4, Y, 2 * SIZE + addi.d X, X, 4 * SIZE + addi.d Y, Y, 4 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmul.s VX0, x1, VXC + vfmadd.s VX0, x3, VXS, VX0 + vfmul.s VX1, x1, VXS + vfmsub.s VX1, x3, VXC, VX1 + vfmul.s VX2, x2, VXC + vfmadd.s VX2, x4, VXS, VX2 + vfmul.s VX3, x2, VXS + vfmsub.s VX3, x4, VXC, VX3 + vilvl.w x1, VX2 ,VX0 + vilvh.w x2, VX2, VX0 + vilvl.w x3, VX3 ,VX1 + vilvh.w x4, VX3, VX1 + vst x1, X, 0 * SIZE + vst x3, Y, 0 * SIZE + vst x2, X, 4 * SIZE + vst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // C!=0 S==0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmul.d VX0, x1, VXC + vfmul.d VX1, x3, VXC + vfmul.d VX2, x2, VXC + vfmul.d VX3, x4, VXC + vilvl.d x1, VX2 ,VX0 + vilvh.d x2, VX2, VX0 + vilvl.d x3, VX3 ,VX1 + vilvh.d x4, VX3, VX1 + vst x1, X, 0 * SIZE + vst x3, Y, 0 * SIZE + vst x2, X, 2 * SIZE + vst x4, Y, 2 * SIZE + addi.d X, X, 4 * SIZE + addi.d Y, Y, 4 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, 
VX2 + vfmul.s VX0, x1, VXC + vfmul.s VX1, x3, VXC + vfmul.s VX2, x2, VXC + vfmul.s VX3, x4, VXC + vilvl.w x1, VX2 ,VX0 + vilvh.w x2, VX2, VX0 + vilvl.w x3, VX3 ,VX1 + vilvh.w x4, VX3, VX1 + vst x1, X, 0 * SIZE + vst x3, Y, 0 * SIZE + vst x2, X, 4 * SIZE + vst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // C==0 S!=0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmul.d VX0, x3, VXS + vfmul.d VX1, x1, VXS + vfsub.d VX1, VXZ, VX1 + vfmul.d VX2, x4, VXS + vfmul.d VX3, x2, VXS + vfsub.d VX3, VXZ, VX3 + vilvl.d x1, VX2 ,VX0 + vilvh.d x2, VX2, VX0 + vilvl.d x3, VX3 ,VX1 + vilvh.d x4, VX3, VX1 + vst x1, X, 0 * SIZE + vst x3, Y, 0 * SIZE + vst x2, X, 2 * SIZE + vst x4, Y, 2 * SIZE + addi.d X, X, 4 * SIZE + addi.d Y, Y, 4 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmul.s VX0, x3, VXS + vfmul.s VX1, x1, VXS + vfsub.s VX1, VXZ, VX1 + vfmul.s VX2, x4, VXS + vfmul.s VX3, x2, VXS + vfsub.s VX3, VXZ, VX3 + vilvl.w x1, VX2 ,VX0 + vilvh.w x2, VX2, VX0 + vilvl.w x3, VX3 ,VX1 + vilvh.w x4, VX3, VX1 + vst x1, X, 0 * SIZE + vst x3, Y, 0 * SIZE + vst x2, X, 4 * SIZE + vst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // C==0 S==0 + vst VXZ, X, 0 * SIZE + vst VXZ, Y, 0 * SIZE +#ifdef DOUBLE + vst VXZ, X, 2 * SIZE + vst VXZ, Y, 2 * SIZE + addi.d X, X, 4 * SIZE + addi.d Y, Y, 4 * SIZE +#else + vst VXZ, X, 4 * SIZE + vst VXZ, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L22: +#ifdef DOUBLE + srai.d I, N, 2 +#endif + bge $r0, I, .L997 + move YY, Y + move XX, X + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L220 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L222 // C!=0 S==0 + b .L221 // C!=0 S!=0 + .align 3 + +.L220: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L224 // C==0 S==0 + b .L223 // C==0 S!=0 + .align 3 + +.L221: // C!=0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x1, VXC + vfmadd.d VX0, x3, VXS, VX0 + vfmul.d VX1, x1, VXS + vfmsub.d VX1, x3, VXC, VX1 + vfmul.d VX2, x2, VXC + vfmadd.d VX2, x4, VXS, VX2 + vfmul.d VX3, x2, VXS + vfmsub.d VX3, x4, VXC, VX3 + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + 
ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x1, VXC + vfmadd.d VX0, x3, VXS, VX0 + vfmul.d VX1, x1, VXS + vfmsub.d VX1, x3, VXC, VX1 + vfmul.d VX2, x2, VXC + vfmadd.d VX2, x4, VXS, VX2 + vfmul.d VX3, x2, VXS + vfmsub.d VX3, x4, VXC, VX3 + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L995 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + + vfmul.s VX0, x1, VXC + vfmadd.s VX0, x3, VXS, VX0 + vfmul.s VX1, x1, VXS + vfmsub.s VX1, x3, VXC, VX1 + vfmul.s VX2, x2, VXC + vfmadd.s VX2, x4, VXS, VX2 + vfmul.s VX3, x2, VXS + vfmsub.s VX3, x4, VXC, VX3 + vstelm.w VX0, XX, 0, 0 + vstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 1 + vstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 2 + vstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 3 + vstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + vstelm.w VX1, YY, 0, 0 + vstelm.w VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 1 + vstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 2 + vstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 3 + vstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 +#endif + .align 3 + +.L222: // C!=0 S==0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x1, VXC + vfmul.d VX1, x3, VXC + vfmul.d VX2, x2, VXC + vfmul.d VX3, x4, VXC + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + 
vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x1, VXC + vfmul.d VX1, x3, VXC + vfmul.d VX2, x2, VXC + vfmul.d VX3, x4, VXC + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + b .L995 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + vfmul.s VX0, x1, VXC + vfmul.s VX1, x3, VXC + vfmul.s VX2, x2, VXC + vfmul.s VX3, x4, VXC + vstelm.w VX0, XX, 0, 0 + vstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 1 + vstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 2 + vstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 3 + vstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + vstelm.w VX1, YY, 0, 0 + vstelm.w VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 1 + vstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 2 + vstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 3 + vstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + b .L997 +#endif + .align 3 + +.L223: // C==0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x3, VXS + vfmul.d VX1, x1, VXS + vfsub.d VX1, VXZ, VX1 + vfmul.d VX2, x4, VXS + vfmul.d VX3, x2, VXS + vfsub.d VX3, VXZ, VX3 + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d 
x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x3, VXS + vfmul.d VX1, x1, VXS + vfsub.d VX1, VXZ, VX1 + vfmul.d VX2, x4, VXS + vfmul.d VX3, x2, VXS + vfsub.d VX3, VXZ, VX3 + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L223 + b .L995 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + vfmul.s VX0, x3, VXS + vfmul.s VX1, x1, VXS + vfsub.s VX1, VXZ, VX1 + vfmul.s VX2, x4, VXS + vfmul.s VX3, x2, VXS + vfsub.s VX3, VXZ, VX3 + vstelm.w VX0, XX, 0, 0 + vstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 1 + vstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 2 + vstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 3 + vstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + vstelm.w VX1, YY, 0, 0 + vstelm.w VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 1 + vstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 2 + vstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 3 + vstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L223 + b .L997 +#endif + .align 3 + +.L224: // C==0 S==0 +#ifdef DOUBLE + vstelm.d VXZ, XX, 0, 0 + vstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + vstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + vstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + vstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VXZ, YY, 0, 0 + vstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + vstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + vstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + vstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + move X, XX + move Y, YY + b .L995 +#else + vstelm.w VXZ, XX, 0, 0 + vstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 0 + vstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 0 + vstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + 
vstelm.w VXZ, XX, 0, 0 + vstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VXZ, YY, 0, 0 + vstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + vstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + vstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + vstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + move X, XX + move Y, YY + b .L997 +#endif + .align 3 + +#ifdef DOUBLE + .L995: + andi I, N, 3 + bge $r0, I, .L999 + b .L998 + .align 3 + +#endif +.L996: + move I, N + b .L998 + .align 3 + +.L997: +#ifdef DOUBLE + andi I, N, 1 +#else + andi I, N, 3 +#endif + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + MUL s1, a1, C + MADD s1, a3, S, s1 + MUL s2, a1, S + MSUB s2, a3, C, s2 + MUL s3, a2, C + MADD s3, a4, S, s3 + MUL s4, a2, S + MSUB s4, a4, C, s4 + addi.d I, I, -1 + ST s1, X, 0 * SIZE + ST s2, Y, 0 * SIZE + ST s3, X, 1 * SIZE + ST s4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE From edabb936681bff6fea0454aedf064c19c1db217f Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Fri, 29 Dec 2023 15:08:10 +0800 Subject: [PATCH 19/21] loongarch64: Refine axpby optimization functions. --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 4 +- kernel/loongarch64/KERNEL.LOONGSON3R5 | 4 +- .../{daxpby_lasx.S => axpby_lasx.S} | 475 ++++++++++++- .../loongarch64/{daxpby_lsx.S => axpby_lsx.S} | 503 +++++++++++++- kernel/loongarch64/saxpby_lasx.S | 597 ----------------- kernel/loongarch64/saxpby_lsx.S | 629 ------------------ 6 files changed, 931 insertions(+), 1281 deletions(-) rename kernel/loongarch64/{daxpby_lasx.S => axpby_lasx.S} (55%) rename kernel/loongarch64/{daxpby_lsx.S => axpby_lsx.S} (56%) delete mode 100644 kernel/loongarch64/saxpby_lasx.S delete mode 100644 kernel/loongarch64/saxpby_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 02ea4304e..0fb0bb68f 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -54,8 +54,8 @@ DAXPYKERNEL = axpy_lsx.S CAXPYKERNEL = caxpy_lsx.S ZAXPYKERNEL = caxpy_lsx.S -SAXPBYKERNEL = saxpby_lsx.S -DAXPBYKERNEL = daxpby_lsx.S +SAXPBYKERNEL = axpby_lsx.S +DAXPBYKERNEL = axpby_lsx.S SSUMKERNEL = sum_lsx.S DSUMKERNEL = sum_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 462698f85..1a6a04532 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -54,8 +54,8 @@ DAXPYKERNEL = axpy_lasx.S CAXPYKERNEL = caxpy_lasx.S ZAXPYKERNEL = caxpy_lasx.S -SAXPBYKERNEL = saxpby_lasx.S -DAXPBYKERNEL = daxpby_lasx.S +SAXPBYKERNEL = axpby_lasx.S +DAXPBYKERNEL = axpby_lasx.S SSUMKERNEL = sum_lasx.S DSUMKERNEL = sum_lasx.S diff --git a/kernel/loongarch64/daxpby_lasx.S b/kernel/loongarch64/axpby_lasx.S similarity index 55% rename from kernel/loongarch64/daxpby_lasx.S rename to kernel/loongarch64/axpby_lasx.S index 4b19703e7..f1d99cd3b 100644 --- a/kernel/loongarch64/daxpby_lasx.S +++ b/kernel/loongarch64/axpby_lasx.S @@ -1,6 +1,33 @@ -#define ASSEMBLER +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#define ASSEMBLER #include "common.h" + #define N $r4 #define ALPHA $f0 #define X $r5 @@ -32,16 +59,22 @@ bge $r0, N, .L999 li.d TEMP, 1 movgr2fr.d a1, $r0 - ffint.d.l a1, a1 + ffint.s.l a1, a1 slli.d TEMP, TEMP, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT slli.d INCY, INCY, BASE_SHIFT - movfr2gr.d t1, ALPHA + MTG t1, ALPHA + MTG t2, BETA + MTG t3, a1 +#ifdef DOUBLE xvreplgr2vr.d VXA, t1 - movfr2gr.d t2, BETA xvreplgr2vr.d VXB, t2 - movfr2gr.d t3, a1 xvreplgr2vr.d VXZ, t3 +#else + xvreplgr2vr.w VXA, t1 + xvreplgr2vr.w VXB, t2 + xvreplgr2vr.w VXZ, t3 +#endif srai.d I, N, 3 bne INCX, TEMP, .L20 bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 @@ -52,21 +85,22 @@ .L11: bge $r0, I, .L997 - fcmp.ceq.d $fcc0, ALPHA, a1 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L110 - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L112 // ALPHA!=0 BETA==0 b .L111 // ALPHA!=0 BETA!=0 .align 3 .L110: - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L114 // ALPHA==0 BETA==0 b .L113 // ALPHA==0 BETA!=0 .align 3 .L111: // ALPHA!=0 BETA!=0 xvld VX0, X, 0 * SIZE +#ifdef DOUBLE xvld VX2, Y, 0 * SIZE xvld VX1, X, 4 * SIZE xvld VX3, Y, 4 * SIZE @@ -77,6 +111,13 @@ addi.d I, I, -1 xvst VX2, Y, 0 * SIZE xvst VX3, Y, 4 * SIZE +#else + xvld VX2, Y, 0 * SIZE + xvfmul.s VX0, VX0, VXA + addi.d I, I, -1 + xvfmadd.s VX2, VX2, VXB, VX0 + xvst VX2, Y, 0 * SIZE +#endif addi.d X, X, 8 * SIZE addi.d Y, Y, 8 * SIZE blt $r0, I, .L111 @@ -85,34 +126,46 @@ .L112: // ALPHA!=0 BETA==0 xvld VX0, X, 0 * SIZE +#ifdef DOUBLE xvld VX1, X, 4 * SIZE xvfmul.d VX0, VX0, VXA xvfmul.d VX1, VX1, VXA xvst VX0, Y, 0 * SIZE xvst VX1, Y, 4 * SIZE +#else + xvfmul.s VX0, VX0, VXA + addi.d I, I, -1 + xvst VX0, Y, 0 * SIZE +#endif addi.d X, X, 8 * SIZE addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 blt $r0, I, .L112 b .L997 .align 3 .L113: // ALPHA==0 BETA!=0 xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE xvld VX3, Y, 4 * SIZE xvfmul.d VX2, VX2, VXB xvfmul.d VX3, VX3, VXB xvst VX2, Y, 0 * SIZE xvst VX3, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE +#else + xvfmul.s 
VX2, VX2, VXB + xvst VX2, Y, 0 * SIZE +#endif addi.d I, I, -1 + addi.d Y, Y, 8 * SIZE blt $r0, I, .L113 b .L997 .align 3 .L114: // ALPHA==0 BETA==0 xvst VXZ, Y, 0 * SIZE +#ifdef DOUBLE xvst VXZ, Y, 4 * SIZE +#endif addi.d Y, Y, 8 * SIZE addi.d I, I, -1 blt $r0, I, .L114 @@ -122,21 +175,22 @@ .L12: // INCX==1 and INCY!=1 bge $r0, I, .L997 move YY, Y - fcmp.ceq.d $fcc0, ALPHA, a1 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L120 - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L122 // ALPHA!=0 BETA==0 b .L121 // ALPHA!=0 BETA!=0 .align 3 .L120: - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L124 // ALPHA==0 BETA==0 b .L123 // ALPHA==0 BETA!=0 .align 3 .L121: // ALPHA!=0 BETA!=0 xvld VX0, X, 0 * SIZE +#ifdef DOUBLE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY ld.d t2, Y, 0 * SIZE @@ -182,14 +236,59 @@ xvstelm.d VX3, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VX3, YY, 0, 3 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX0, VX0, VXA + xvfmadd.s VX2, VX2, VXB, VX0 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif add.d YY, YY, INCY addi.d X, X, 8 * SIZE + addi.d I, I, -1 blt $r0, I, .L121 b .L997 .align 3 .L122: // ALPHA!=0 BETA==0 xvld VX0, X, 0 * SIZE +#ifdef DOUBLE xvld VX1, X, 4 * SIZE xvfmul.d VX0, VX0, VXA xvfmul.d VX1, VX1, VXA @@ -208,14 +307,33 @@ xvstelm.d VX1, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VX1, YY, 0, 3 +#else + xvfmul.s VX0, VX0, VXA + addi.d I, I, -1 + xvstelm.w VX0, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 7 +#endif add.d YY, YY, INCY addi.d X, X, 8 * SIZE - addi.d I, I, -1 blt $r0, I, .L122 b .L997 .align 3 .L123: // ALPHA==0 BETA!=0 +#ifdef DOUBLE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY ld.d t2, Y, 0 * SIZE @@ -250,7 +368,6 @@ xvstelm.d VX2, YY, 0, 3 add.d YY, YY, INCY xvfmul.d VX3, VX3, VXB - addi.d I, I, -1 xvstelm.d VX3, YY, 0, 0 add.d YY, YY, INCY xvstelm.d VX3, YY, 0, 1 @@ -258,12 +375,56 @@ xvstelm.d VX3, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VX3, YY, 0, 3 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, 
t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX2, VX2, VXB + xvstelm.w VX2, YY, 0, 0 add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 blt $r0, I, .L123 b .L997 .align 3 .L124: // ALPHA==0 BETA==0 +#ifdef DOUBLE xvstelm.d VXZ, YY, 0, 0 add.d YY, YY, INCY xvstelm.d VXZ, YY, 0, 1 @@ -279,6 +440,23 @@ xvstelm.d VXZ, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VXZ, YY, 0, 3 +#else + xvstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 7 +#endif add.d YY, YY, INCY addi.d I, I, -1 blt $r0, I, .L124 @@ -287,21 +465,22 @@ .L21:// INCX!=1 and INCY==1 bge $r0, I, .L997 - fcmp.ceq.d $fcc0, ALPHA, a1 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L210 - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L212 // ALPHA!=0 BETA==0 b .L211 // ALPHA!=0 BETA!=0 .align 3 .L210: - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L214 // ALPHA==0 BETA==0 b .L213 // ALPHA==0 BETA!=0 .align 3 .L211: // ALPHA!=0 BETA!=0 xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -334,12 +513,43 @@ xvfmadd.d VX3, VX3, VXB, VX1 addi.d I, I, -1 xvst VX3, Y, 4 * SIZE +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VX0, VXA, VX0 + xvfmadd.s VX2, VX2, VXB, VX0 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE +#endif addi.d Y, Y, 8 * SIZE blt $r0, I, .L211 b .L997 .align 3 .L212: // ALPHA!=0 BETA==0 +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -369,6 +579,35 @@ xvfmul.d VX1, VX1, VXA addi.d I, I, -1 xvst VX1, Y, 4 * SIZE +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VX0, VXA, VX0 + addi.d I, I, -1 + xvst VX0, Y, 0 * SIZE +#endif addi.d Y, Y, 8 * SIZE blt $r0, I, .L212 b .L997 @@ -376,20 +615,27 @@ .L213: // ALPHA==0 BETA!=0 xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE xvld VX3, Y, 4 * SIZE xvfmul.d VX2, VX2, VXB xvfmul.d VX3, VX3, VXB - addi.d I, I, -1 xvst VX2, Y, 0 * SIZE xvst VX3, Y, 4 * SIZE +#else + xvfmul.s 
VX2, VX2, VXB + xvst VX2, Y, 0 * SIZE +#endif addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 blt $r0, I, .L213 b .L997 .align 3 .L214: // ALPHA==0 BETA==0 xvst VXZ, Y, 0 * SIZE +#ifdef DOUBLE xvst VXZ, Y, 4 * SIZE +#endif addi.d Y, Y, 8 * SIZE addi.d I, I, -1 blt $r0, I, .L214 @@ -399,20 +645,21 @@ .L22: bge $r0, I, .L997 move YY, Y - fcmp.ceq.d $fcc0, ALPHA, a1 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L220 - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L222 // ALPHA!=0 BETA==0 b .L221 // ALPHA!=0 BETA!=0 .align 3 .L220: - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L224 // ALPHA==0 BETA==0 b .L223 // ALPHA==0 BETA!=0 .align 3 .L221: // ALPHA!=0 BETA!=0 +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -481,12 +728,81 @@ xvstelm.d VX3, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VX3, YY, 0, 3 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX0, VX0, VXA + xvfmadd.s VX2, VX2, VXB, VX0 + addi.d I, I, -1 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif add.d YY, YY, INCY blt $r0, I, .L221 b .L997 .align 3 .L222: // ALPHA!=0 BETA==0 +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -529,12 +845,56 @@ xvstelm.d VX1, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VX1, YY, 0, 3 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VX0, VX0, VXA + addi.d I, I, -1 + xvstelm.w VX0, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 6 + add.d YY, YY, 
INCY + xvstelm.w VX0, YY, 0, 7 +#endif add.d YY, YY, INCY blt $r0, I, .L222 b .L997 .align 3 .L223: // ALPHA==0 BETA!=0 +#ifdef DOUBLE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY ld.d t2, Y, 0 * SIZE @@ -577,12 +937,56 @@ xvstelm.d VX3, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VX3, YY, 0, 3 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX2, VX2, VXB + addi.d I, I, -1 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif add.d YY, YY, INCY blt $r0, I, .L223 b .L997 .align 3 .L224: // ALPHA==0 BETA==0 +#ifdef DOUBLE xvstelm.d VXZ, YY, 0, 0 add.d YY, YY, INCY xvstelm.d VXZ, YY, 0, 1 @@ -598,6 +1002,23 @@ xvstelm.d VXZ, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VXZ, YY, 0, 3 +#else + xvstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 7 +#endif add.d YY, YY, INCY addi.d I, I, -1 blt $r0, I, .L224 @@ -610,12 +1031,12 @@ .align 3 .L998: - fld.d $f12, X, 0 * SIZE - fld.d $f13, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f13, Y, 0 * SIZE addi.d I, I, -1 - fmul.d $f12, $f12, ALPHA - fmadd.d $f13, $f13, BETA, $f12 - fst.d $f13, Y, 0 * SIZE + MUL $f12, $f12, ALPHA + MADD $f13, $f13, BETA, $f12 + ST $f13, Y, 0 * SIZE add.d X, X, INCX add.d Y, Y, INCY blt $r0, I, .L998 diff --git a/kernel/loongarch64/daxpby_lsx.S b/kernel/loongarch64/axpby_lsx.S similarity index 56% rename from kernel/loongarch64/daxpby_lsx.S rename to kernel/loongarch64/axpby_lsx.S index 9aafbaf2a..45154c262 100644 --- a/kernel/loongarch64/daxpby_lsx.S +++ b/kernel/loongarch64/axpby_lsx.S @@ -1,6 +1,33 @@ -#define ASSEMBLER +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#define ASSEMBLER #include "common.h" + #define N $r4 #define ALPHA $f0 #define X $r5 @@ -32,16 +59,22 @@ bge $r0, N, .L999 li.d TEMP, 1 movgr2fr.d a1, $r0 - ffint.d.l a1, a1 + ffint.s.l a1, a1 slli.d TEMP, TEMP, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT slli.d INCY, INCY, BASE_SHIFT - movfr2gr.d t1, ALPHA + MTG t1, ALPHA + MTG t2, BETA + MTG t3, a1 +#ifdef DOUBLE vreplgr2vr.d VXA, t1 - movfr2gr.d t2, BETA vreplgr2vr.d VXB, t2 - movfr2gr.d t3, a1 vreplgr2vr.d VXZ, t3 +#else + vreplgr2vr.w VXA, t1 + vreplgr2vr.w VXB, t2 + vreplgr2vr.w VXZ, t3 +#endif srai.d I, N, 3 bne INCX, TEMP, .L20 bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 @@ -52,15 +85,15 @@ .L11: bge $r0, I, .L997 - fcmp.ceq.d $fcc0, ALPHA, a1 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L110 - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L112 // ALPHA!=0 BETA==0 b .L111 // ALPHA!=0 BETA!=0 .align 3 .L110: - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L114 // ALPHA==0 BETA==0 b .L113 // ALPHA==0 BETA!=0 .align 3 @@ -68,6 +101,7 @@ .L111: // ALPHA!=0 BETA!=0 vld VX0, X, 0 * SIZE vld VX2, Y, 0 * SIZE +#ifdef DOUBLE vld VX1, X, 2 * SIZE vld VX3, Y, 2 * SIZE vfmul.d VX0, VX0, VXA @@ -86,6 +120,16 @@ vfmadd.d VX3, VX3, VXB, VX1 vst VX2, Y, 4 * SIZE vst VX3, Y, 6 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vfmul.s VX0, VX0, VXA + vfmul.s VX1, VX1, VXA + vfmadd.s VX2, VX2, VXB, VX0 + vfmadd.s VX3, VX3, VXB, VX1 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE +#endif addi.d X, X, 8 * SIZE addi.d Y, Y, 8 * SIZE addi.d I, I, -1 @@ -95,6 +139,7 @@ .L112: // ALPHA!=0 BETA==0 vld VX0, X, 0 * SIZE +#ifdef DOUBLE vld VX1, X, 2 * SIZE vfmul.d VX0, VX0, VXA vfmul.d VX1, VX1, VXA @@ -106,6 +151,13 @@ vfmul.d VX3, VX3, VXA vst VX2, Y, 4 * SIZE vst VX3, Y, 6 * SIZE +#else + vld VX1, X, 4 * SIZE + vfmul.s VX0, VX0, VXA + vfmul.s VX1, VX1, VXA + vst VX0, Y, 0 * SIZE + vst VX1, Y, 4 * SIZE +#endif addi.d X, X, 8 * SIZE addi.d Y, Y, 8 * SIZE addi.d I, I, -1 @@ -113,7 +165,8 @@ b .L997 .align 3 -.L113: // ALPHA==0 BETA!=0\ +.L113: // ALPHA==0 BETA!=0 +#ifdef DOUBLE vld VX0, Y, 0 * SIZE vld VX1, Y, 2 * SIZE vfmul.d VX0, VX0, VXB @@ -126,6 +179,14 @@ vfmul.d VX3, VX3, VXB vst VX2, Y, 4 * SIZE vst VX3, Y, 6 * SIZE +#else + vld VX2, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + vfmul.s VX2, VX2, VXB + vfmul.s VX3, VX3, VXB + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE +#endif addi.d Y, Y, 8 * SIZE addi.d I, I, -1 blt $r0, I, .L113 @@ -134,9 +195,13 @@ .L114: // ALPHA==0 BETA==0 vst VXZ, Y, 0 * SIZE +#ifdef DOUBLE vst VXZ, Y, 2 * SIZE vst VXZ, Y, 4 * SIZE vst VXZ, Y, 6 * SIZE +#else + vst VXZ, Y, 4 * SIZE +#endif addi.d Y, Y, 8 * SIZE addi.d I, I, -1 blt $r0, I, .L114 @@ -146,21 +211,22 @@ 
.L12: // INCX==1 and INCY!=1 bge $r0, I, .L997 move YY, Y - fcmp.ceq.d $fcc0, ALPHA, a1 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L120 - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L122 // ALPHA!=0 BETA==0 b .L121 // ALPHA!=0 BETA!=0 .align 3 .L120: - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L124 // ALPHA==0 BETA==0 b .L123 // ALPHA==0 BETA!=0 .align 3 .L121: // ALPHA!=0 BETA!=0 vld VX0, X, 0 * SIZE +#ifdef DOUBLE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY ld.d t2, Y, 0 * SIZE @@ -212,6 +278,53 @@ vstelm.d VX3, YY, 0, 0 add.d YY, YY, INCY vstelm.d VX3, YY, 0, 1 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VX0, VX0, VXA + vld VX1, X, 4 * SIZE + vfmadd.s VX2, VX2, VXB, VX0 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + vfmul.s VX1, VX1, VXA + vfmadd.s VX3, VX3, VXB, VX1 + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 +#endif add.d YY, YY, INCY addi.d X, X, 8 * SIZE blt $r0, I, .L121 @@ -220,6 +333,7 @@ .L122: // ALPHA!=0 BETA==0 vld VX0, X, 0 * SIZE +#ifdef DOUBLE vld VX1, X, 2 * SIZE vfmul.d VX0, VX0, VXA vfmul.d VX1, VX1, VXA @@ -242,6 +356,26 @@ vstelm.d VX1, YY, 0, 0 add.d YY, YY, INCY vstelm.d VX1, YY, 0, 1 +#else + vld VX1, X, 4 * SIZE + vfmul.s VX0, VX0, VXA + vfmul.s VX1, VX1, VXA + vstelm.w VX0, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 3 +#endif add.d YY, YY, INCY addi.d X, X, 8 * SIZE addi.d I, I, -1 @@ -250,6 +384,7 @@ .align 3 .L123: // ALPHA==0 BETA!=0 +#ifdef DOUBLE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY ld.d t2, Y, 0 * SIZE @@ -294,12 +429,57 @@ vstelm.d VX3, YY, 0, 0 add.d YY, YY, INCY vstelm.d VX3, YY, 0, 1 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VX2, VX2, VXB + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + vfmul.s VX3, VX3, VXB + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, 
YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 +#endif add.d YY, YY, INCY blt $r0, I, .L123 b .L997 .align 3 .L124: // ALPHA==0 BETA==0 +#ifdef DOUBLE vstelm.d VXZ, YY, 0, 0 add.d YY, YY, INCY vstelm.d VXZ, YY, 0, 1 @@ -315,6 +495,23 @@ vstelm.d VXZ, YY, 0, 0 add.d YY, YY, INCY vstelm.d VXZ, YY, 0, 1 +#else + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 +#endif add.d YY, YY, INCY addi.d I, I, -1 blt $r0, I, .L124 @@ -323,21 +520,22 @@ .L21:// INCX!=1 and INCY==1 bge $r0, I, .L997 - fcmp.ceq.d $fcc0, ALPHA, a1 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L210 - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L212 // ALPHA!=0 BETA==0 b .L211 // ALPHA!=0 BETA!=0 .align 3 .L210: - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L214 // ALPHA==0 BETA==0 b .L213 // ALPHA==0 BETA!=0 .align 3 .L211: // ALPHA!=0 BETA!=0 vld VX2, Y, 0 * SIZE +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -378,12 +576,47 @@ vfmadd.d VX3, VX3, VXB, VX1 addi.d I, I, -1 vst VX3, Y, 6 * SIZE +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmul.s VX0, VXA, VX0 + vld VX3, Y, 4 * SIZE + vfmadd.s VX2, VX2, VXB, VX0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vst VX2, Y, 0 * SIZE + vfmul.s VX1, VX1, VXA + vfmadd.s VX3, VX3, VXB, VX1 + addi.d I, I, -1 + vst VX3, Y, 4 * SIZE +#endif addi.d Y, Y, 8 * SIZE blt $r0, I, .L211 b .L997 .align 3 .L212: // ALPHA!=0 BETA==0 +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -417,6 +650,37 @@ vfmul.d VX1, VX1, VXA addi.d I, I, -1 vst VX1, Y, 6 * SIZE +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmul.s VX0, VXA, VX0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vst VX0, Y, 0 * SIZE + vfmul.s VX1, VX1, VXA + addi.d I, I, -1 + vst VX1, Y, 4 * SIZE +#endif addi.d Y, Y, 8 * SIZE blt $r0, I, .L212 b .L997 @@ -424,6 +688,7 @@ .L213: // ALPHA==0 BETA!=0 vld VX2, Y, 0 * SIZE +#ifdef DOUBLE vld VX3, Y, 2 * SIZE vfmul.d VX2, VX2, VXB vfmul.d VX3, VX3, VXB @@ -433,19 +698,30 @@ vld VX3, Y, 6 * SIZE vfmul.d VX2, VX2, VXB vfmul.d VX3, VX3, VXB - addi.d I, I, -1 vst VX2, Y, 4 * SIZE vst VX3, Y, 6 * SIZE +#else + vld VX3, Y, 4 * SIZE + vfmul.s VX2, VX2, VXB + vfmul.s VX3, VX3, VXB + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE +#endif addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 blt $r0, I, .L213 b .L997 .align 3 .L214: // 
ALPHA==0 BETA==0 vst VXZ, Y, 0 * SIZE +#ifdef DOUBLE vst VXZ, Y, 2 * SIZE vst VXZ, Y, 4 * SIZE vst VXZ, Y, 6 * SIZE +#else + vst VXZ, Y, 4 * SIZE +#endif addi.d Y, Y, 8 * SIZE addi.d I, I, -1 blt $r0, I, .L214 @@ -455,20 +731,21 @@ .L22: bge $r0, I, .L997 move YY, Y - fcmp.ceq.d $fcc0, ALPHA, a1 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L220 - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L222 // ALPHA!=0 BETA==0 b .L221 // ALPHA!=0 BETA!=0 .align 3 .L220: - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L224 // ALPHA==0 BETA==0 b .L223 // ALPHA==0 BETA!=0 .align 3 .L221: // ALPHA!=0 BETA!=0 +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -541,12 +818,83 @@ vstelm.d VX3, YY, 0, 0 add.d YY, YY, INCY vstelm.d VX3, YY, 0, 1 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VX0, VX0, VXA + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vfmadd.s VX2, VX2, VXB, VX0 + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vfmul.s VX1, VX1, VXA + addi.d I, I, -1 + vfmadd.s VX3, VX3, VXB, VX1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 +#endif add.d YY, YY, INCY blt $r0, I, .L221 b .L997 .align 3 .L222: // ALPHA!=0 BETA==0 +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -591,12 +939,57 @@ vstelm.d VX1, YY, 0, 0 add.d YY, YY, INCY vstelm.d VX1, YY, 0, 1 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmul.s VX0, VX0, VXA + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vstelm.w VX0, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 3 + add.d YY, YY, INCY + vfmul.s VX1, VX1, VXA + addi.d I, I, -1 + vstelm.w VX1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 3 
+#endif add.d YY, YY, INCY blt $r0, I, .L222 b .L997 .align 3 .L223: // ALPHA==0 BETA!=0 +#ifdef DOUBLE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY ld.d t2, Y, 0 * SIZE @@ -641,12 +1034,57 @@ vstelm.d VX3, YY, 0, 0 add.d YY, YY, INCY vstelm.d VX3, YY, 0, 1 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VX2, VX2, VXB + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + vfmul.s VX3, VX3, VXB + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 +#endif add.d YY, YY, INCY blt $r0, I, .L223 b .L997 .align 3 .L224: // ALPHA==0 BETA==0 +#ifdef DOUBLE vstelm.d VXZ, YY, 0, 0 add.d YY, YY, INCY vstelm.d VXZ, YY, 0, 1 @@ -662,6 +1100,23 @@ vstelm.d VXZ, YY, 0, 0 add.d YY, YY, INCY vstelm.d VXZ, YY, 0, 1 +#else + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 +#endif add.d YY, YY, INCY addi.d I, I, -1 blt $r0, I, .L224 @@ -674,12 +1129,12 @@ .align 3 .L998: - fld.d $f12, X, 0 * SIZE - fld.d $f13, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f13, Y, 0 * SIZE addi.d I, I, -1 - fmul.d $f12, $f12, ALPHA - fmadd.d $f13, $f13, BETA, $f12 - fst.d $f13, Y, 0 * SIZE + MUL $f12, $f12, ALPHA + MADD $f13, $f13, BETA, $f12 + ST $f13, Y, 0 * SIZE add.d X, X, INCX add.d Y, Y, INCY blt $r0, I, .L998 diff --git a/kernel/loongarch64/saxpby_lasx.S b/kernel/loongarch64/saxpby_lasx.S deleted file mode 100644 index c5d1ff402..000000000 --- a/kernel/loongarch64/saxpby_lasx.S +++ /dev/null @@ -1,597 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define ALPHA $f0 -#define X $r5 -#define INCX $r6 -#define BETA $f1 -#define Y $r7 -#define INCY $r8 - -#define I $r12 -#define TEMP $r13 -#define t1 $r14 -#define t2 $r16 -#define t3 $r15 -#define t4 $r17 -#define XX $r18 -#define YY $r19 -#define a1 $f12 -#define a2 $f13 -#define VX0 $xr8 -#define VX1 $xr20 -#define VX2 $xr21 -#define VX3 $xr22 -#define VXA $xr23 -#define VXB $xr9 -#define VXZ $xr19 - - PROLOGUE - - bge $r0, N, .L999 - li.d TEMP, 1 - movgr2fr.d a1, $r0 - ffint.s.l a1, a1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - movfr2gr.s t1, ALPHA - xvreplgr2vr.w VXA, t1 - movfr2gr.s t2, BETA - xvreplgr2vr.w VXB, t2 - movfr2gr.s t3, a1 - xvreplgr2vr.w VXZ, t3 - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L997 - fcmp.ceq.s $fcc0, ALPHA, a1 - bcnez $fcc0, .L110 - fcmp.ceq.s $fcc0, BETA, a1 - 
bcnez $fcc0, .L112 // ALPHA!=0 BETA==0 - b .L111 // ALPHA!=0 BETA!=0 - .align 3 - -.L110: - fcmp.ceq.s $fcc0, BETA, a1 - bcnez $fcc0, .L114 // ALPHA==0 BETA==0 - b .L113 // ALPHA==0 BETA!=0 - .align 3 - -.L111: // ALPHA!=0 BETA!=0 - xvld VX0, X, 0 * SIZE - xvld VX2, Y, 0 * SIZE - xvfmul.s VX0, VX0, VXA - addi.d I, I, -1 - xvfmadd.s VX2, VX2, VXB, VX0 - xvst VX2, Y, 0 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L111 - b .L997 - .align 3 - -.L112: // ALPHA!=0 BETA==0 - xvld VX0, X, 0 * SIZE - xvfmul.s VX0, VX0, VXA - addi.d I, I, -1 - xvst VX0, Y, 0 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L112 - b .L997 - .align 3 - -.L113: // ALPHA==0 BETA!=0 - xvld VX2, Y, 0 * SIZE - xvfmul.s VX2, VX2, VXB - addi.d I, I, -1 - xvst VX2, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L113 - b .L997 - .align 3 - -.L114: // ALPHA==0 BETA==0 - xvst VXZ, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L114 - b .L997 - .align 3 - -.L12: // INCX==1 and INCY!=1 - bge $r0, I, .L997 - move YY, Y - fcmp.ceq.s $fcc0, ALPHA, a1 - bcnez $fcc0, .L120 - fcmp.ceq.s $fcc0, BETA, a1 - bcnez $fcc0, .L122 // ALPHA!=0 BETA==0 - b .L121 // ALPHA!=0 BETA!=0 - .align 3 - -.L120: - fcmp.ceq.s $fcc0, BETA, a1 - bcnez $fcc0, .L124 // ALPHA==0 BETA==0 - b .L123 // ALPHA==0 BETA!=0 - .align 3 - -.L121: // ALPHA!=0 BETA!=0 - xvld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvfmul.s VX0, VX0, VXA - xvfmadd.s VX2, VX2, VXB, VX0 - xvstelm.w VX2, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 7 - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - b .L997 - .align 3 - -.L122: // ALPHA!=0 BETA==0 - xvld VX0, X, 0 * SIZE - xvfmul.s VX0, VX0, VXA - addi.d I, I, -1 - xvstelm.w VX0, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VX0, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VX0, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VX0, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VX0, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VX0, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VX0, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VX0, YY, 0, 7 - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - blt $r0, I, .L122 - b .L997 - .align 3 - -.L123: // ALPHA==0 BETA!=0 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - 
add.d Y, Y, INCY - xvfmul.s VX2, VX2, VXB - xvstelm.w VX2, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 7 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L123 - b .L997 - .align 3 - -.L124: // ALPHA==0 BETA==0 - xvstelm.w VXZ, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 7 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L124 - b .L997 - .align 3 - -.L21:// INCX!=1 and INCY==1 - bge $r0, I, .L997 - fcmp.ceq.s $fcc0, ALPHA, a1 - bcnez $fcc0, .L210 - fcmp.ceq.s $fcc0, BETA, a1 - bcnez $fcc0, .L212 // ALPHA!=0 BETA==0 - b .L211 // ALPHA!=0 BETA!=0 - .align 3 - -.L210: - fcmp.ceq.s $fcc0, BETA, a1 - bcnez $fcc0, .L214 // ALPHA==0 BETA==0 - b .L213 // ALPHA==0 BETA!=0 - .align 3 - -.L211: // ALPHA!=0 BETA!=0 - xvld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - add.d X, X, INCX - xvfmul.s VX0, VXA, VX0 - xvfmadd.s VX2, VX2, VXB, VX0 - addi.d I, I, -1 - xvst VX2, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L211 - b .L997 - .align 3 - -.L212: // ALPHA!=0 BETA==0 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - add.d X, X, INCX - xvfmul.s VX0, VXA, VX0 - addi.d I, I, -1 - xvst VX0, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L212 - b .L997 - .align 3 - -.L213: // ALPHA==0 BETA!=0 - xvld VX2, Y, 0 * SIZE - xvfmul.s VX2, VX2, VXB - xvst VX2, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L213 - b .L997 - .align 3 - -.L214: // ALPHA==0 BETA==0 - xvst VXZ, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L214 - b .L997 - .align 3 - -.L22: - bge $r0, I, .L997 - move YY, Y - fcmp.ceq.s $fcc0, ALPHA, a1 - bcnez $fcc0, .L220 - fcmp.ceq.s $fcc0, BETA, a1 - bcnez $fcc0, .L222 // ALPHA!=0 BETA==0 - b .L221 // ALPHA!=0 BETA!=0 - .align 3 - -.L220: - fcmp.ceq.s $fcc0, BETA, a1 - bcnez $fcc0, .L224 // ALPHA==0 BETA==0 - b .L223 // ALPHA==0 BETA!=0 - .align 3 - -.L221: // ALPHA!=0 BETA!=0 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w 
t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - add.d X, X, INCX - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvfmul.s VX0, VX0, VXA - xvfmadd.s VX2, VX2, VXB, VX0 - addi.d I, I, -1 - xvstelm.w VX2, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 7 - add.d YY, YY, INCY - blt $r0, I, .L221 - b .L997 - .align 3 - -.L222: // ALPHA!=0 BETA==0 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - add.d X, X, INCX - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - add.d X, X, INCX - xvfmul.s VX0, VX0, VXA - addi.d I, I, -1 - xvstelm.w VX0, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VX0, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VX0, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VX0, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VX0, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VX0, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VX0, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VX0, YY, 0, 7 - add.d YY, YY, INCY - blt $r0, I, .L222 - b .L997 - .align 3 - -.L223: // ALPHA==0 BETA!=0 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvfmul.s VX2, VX2, VXB - addi.d I, I, -1 - xvstelm.w VX2, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 7 - add.d YY, YY, INCY - blt $r0, I, .L223 - b .L997 - .align 3 - -.L224: // ALPHA==0 BETA==0 - 
xvstelm.w VXZ, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 7 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L224 - b .L997 - .align 3 - -.L997: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L998: - fld.s $f12, X, 0 * SIZE - fld.s $f13, Y, 0 * SIZE - addi.d I, I, -1 - fmul.s $f12, $f12, ALPHA - fmadd.s $f13, $f13, BETA, $f12 - fst.s $f13, Y, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L998 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/saxpby_lsx.S b/kernel/loongarch64/saxpby_lsx.S deleted file mode 100644 index 7f8cea2dd..000000000 --- a/kernel/loongarch64/saxpby_lsx.S +++ /dev/null @@ -1,629 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define ALPHA $f0 -#define X $r5 -#define INCX $r6 -#define BETA $f1 -#define Y $r7 -#define INCY $r8 - -#define I $r12 -#define TEMP $r13 -#define t1 $r14 -#define t2 $r16 -#define t3 $r15 -#define t4 $r17 -#define XX $r18 -#define YY $r19 -#define a1 $f12 -#define a2 $f13 -#define VX0 $vr8 -#define VX1 $vr20 -#define VX2 $vr21 -#define VX3 $vr22 -#define VXA $vr23 -#define VXB $vr9 -#define VXZ $vr19 - - PROLOGUE - - bge $r0, N, .L999 - li.d TEMP, 1 - movgr2fr.d a1, $r0 - ffint.s.l a1, a1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - movfr2gr.s t1, ALPHA - vreplgr2vr.w VXA, t1 - movfr2gr.s t2, BETA - vreplgr2vr.w VXB, t2 - movfr2gr.s t3, a1 - vreplgr2vr.w VXZ, t3 - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L997 - fcmp.ceq.s $fcc0, ALPHA, a1 - bcnez $fcc0, .L110 - fcmp.ceq.s $fcc0, BETA, a1 - bcnez $fcc0, .L112 // ALPHA!=0 BETA==0 - b .L111 // ALPHA!=0 BETA!=0 - .align 3 - -.L110: - fcmp.ceq.s $fcc0, BETA, a1 - bcnez $fcc0, .L114 // ALPHA==0 BETA==0 - b .L113 // ALPHA==0 BETA!=0 - .align 3 - -.L111: // ALPHA!=0 BETA!=0 - vld VX0, X, 0 * SIZE - vld VX2, Y, 0 * SIZE - vld VX1, X, 4 * SIZE - vld VX3, Y, 4 * SIZE - vfmul.s VX0, VX0, VXA - vfmul.s VX1, VX1, VXA - vfmadd.s VX2, VX2, VXB, VX0 - vfmadd.s VX3, VX3, VXB, VX1 - vst VX2, Y, 0 * SIZE - vst VX3, Y, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L111 - b .L997 - .align 3 - -.L112: // ALPHA!=0 BETA==0 - vld VX0, X, 0 * SIZE - vld VX1, X, 4 * SIZE - vfmul.s VX0, VX0, VXA - vfmul.s VX1, VX1, VXA - vst VX0, Y, 0 * SIZE - vst VX1, Y, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L112 - b .L997 - .align 3 - -.L113: // ALPHA==0 BETA!=0 - vld VX2, Y, 0 * SIZE - vld VX3, Y, 4 * SIZE - vfmul.s VX2, VX2, VXB - vfmul.s VX3, VX3, VXB - vst VX2, Y, 0 * SIZE - vst VX3, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L113 - b .L997 - .align 3 - -.L114: // ALPHA==0 BETA==0 - vst VXZ, Y, 0 * SIZE - vst VXZ, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L114 - b .L997 - .align 3 - -.L12: // INCX==1 and INCY!=1 - bge $r0, I, .L997 - move YY, Y - fcmp.ceq.s $fcc0, ALPHA, a1 - bcnez $fcc0, .L120 - fcmp.ceq.s $fcc0, BETA, a1 - bcnez $fcc0, .L122 // ALPHA!=0 BETA==0 
- b .L121 // ALPHA!=0 BETA!=0 - .align 3 - -.L120: - fcmp.ceq.s $fcc0, BETA, a1 - bcnez $fcc0, .L124 // ALPHA==0 BETA==0 - b .L123 // ALPHA==0 BETA!=0 - .align 3 - -.L121: // ALPHA!=0 BETA!=0 - vld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - vfmul.s VX0, VX0, VXA - vld VX1, X, 4 * SIZE - vfmadd.s VX2, VX2, VXB, VX0 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - vstelm.w VX2, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 3 - add.d YY, YY, INCY - vfmul.s VX1, VX1, VXA - vfmadd.s VX3, VX3, VXB, VX1 - addi.d I, I, -1 - vstelm.w VX3, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 3 - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - blt $r0, I, .L121 - b .L997 - .align 3 - -.L122: // ALPHA!=0 BETA==0 - vld VX0, X, 0 * SIZE - vld VX1, X, 4 * SIZE - vfmul.s VX0, VX0, VXA - vfmul.s VX1, VX1, VXA - vstelm.w VX0, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX0, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX0, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VX0, YY, 0, 3 - add.d YY, YY, INCY - vstelm.w VX1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VX1, YY, 0, 3 - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L122 - b .L997 - .align 3 - -.L123: // ALPHA==0 BETA!=0 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - vfmul.s VX2, VX2, VXB - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - vstelm.w VX2, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 3 - add.d YY, YY, INCY - vfmul.s VX3, VX3, VXB - addi.d I, I, -1 - vstelm.w VX3, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 3 - add.d YY, YY, INCY - blt $r0, I, .L123 - b .L997 - .align 3 - -.L124: // ALPHA==0 BETA==0 - vstelm.w VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 3 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 3 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L124 - b .L997 - .align 3 - -.L21:// INCX!=1 and INCY==1 - bge $r0, I, .L997 - fcmp.ceq.s $fcc0, ALPHA, a1 - bcnez $fcc0, .L210 - fcmp.ceq.s $fcc0, BETA, a1 - bcnez $fcc0, 
.L212 // ALPHA!=0 BETA==0 - b .L211 // ALPHA!=0 BETA!=0 - .align 3 - -.L210: - fcmp.ceq.s $fcc0, BETA, a1 - bcnez $fcc0, .L214 // ALPHA==0 BETA==0 - b .L213 // ALPHA==0 BETA!=0 - .align 3 - -.L211: // ALPHA!=0 BETA!=0 - vld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - add.d X, X, INCX - vfmul.s VX0, VXA, VX0 - vld VX3, Y, 4 * SIZE - vfmadd.s VX2, VX2, VXB, VX0 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vst VX2, Y, 0 * SIZE - vfmul.s VX1, VX1, VXA - vfmadd.s VX3, VX3, VXB, VX1 - addi.d I, I, -1 - vst VX3, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L211 - b .L997 - .align 3 - -.L212: // ALPHA!=0 BETA==0 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - add.d X, X, INCX - vfmul.s VX0, VXA, VX0 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vst VX0, Y, 0 * SIZE - vfmul.s VX1, VX1, VXA - addi.d I, I, -1 - vst VX1, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L212 - b .L997 - .align 3 - -.L213: // ALPHA==0 BETA!=0 - vld VX2, Y, 0 * SIZE - vld VX3, Y, 4 * SIZE - vfmul.s VX2, VX2, VXB - vfmul.s VX3, VX3, VXB - vst VX2, Y, 0 * SIZE - vst VX3, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L213 - b .L997 - .align 3 - -.L214: // ALPHA==0 BETA==0 - vst VXZ, Y, 0 * SIZE - vst VXZ, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L214 - b .L997 - .align 3 - -.L22: - bge $r0, I, .L997 - move YY, Y - fcmp.ceq.s $fcc0, ALPHA, a1 - bcnez $fcc0, .L220 - fcmp.ceq.s $fcc0, BETA, a1 - bcnez $fcc0, .L222 // ALPHA!=0 BETA==0 - b .L221 // ALPHA!=0 BETA!=0 - .align 3 - -.L220: - fcmp.ceq.s $fcc0, BETA, a1 - bcnez $fcc0, .L224 // ALPHA==0 BETA==0 - b .L223 // ALPHA==0 BETA!=0 - .align 3 - -.L221: // ALPHA!=0 BETA!=0 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - vfmul.s VX0, VX0, VXA - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vfmadd.s VX2, VX2, VXB, VX0 - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vstelm.w VX2, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w 
VX2, YY, 0, 3 - add.d YY, YY, INCY - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - add.d Y, Y, INCY - vfmul.s VX1, VX1, VXA - addi.d I, I, -1 - vfmadd.s VX3, VX3, VXB, VX1 - vstelm.w VX3, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 3 - add.d YY, YY, INCY - blt $r0, I, .L221 - b .L997 - .align 3 - -.L222: // ALPHA!=0 BETA==0 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - add.d X, X, INCX - vfmul.s VX0, VX0, VXA - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vstelm.w VX0, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX0, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX0, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VX0, YY, 0, 3 - add.d YY, YY, INCY - vfmul.s VX1, VX1, VXA - addi.d I, I, -1 - vstelm.w VX1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VX1, YY, 0, 3 - add.d YY, YY, INCY - blt $r0, I, .L222 - b .L997 - .align 3 - -.L223: // ALPHA==0 BETA!=0 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - vfmul.s VX2, VX2, VXB - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - vstelm.w VX2, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 3 - add.d YY, YY, INCY - vfmul.s VX3, VX3, VXB - addi.d I, I, -1 - vstelm.w VX3, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 3 - add.d YY, YY, INCY - blt $r0, I, .L223 - b .L997 - .align 3 - -.L224: // ALPHA==0 BETA==0 - vstelm.w VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 3 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 3 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L224 - b .L997 - .align 3 - -.L997: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L998: - fld.s $f12, X, 0 * SIZE - fld.s $f13, Y, 0 * SIZE - addi.d I, I, -1 - fmul.s $f12, $f12, ALPHA - fmadd.s $f13, $f13, BETA, $f12 - fst.s $f13, Y, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L998 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE From 546f13558caad680fdd82778c106ebaa3bd7331c Mon 
Sep 17 00:00:00 2001
From: gxw
Date: Fri, 29 Dec 2023 11:03:53 +0800
Subject: [PATCH 20/21] loongarch64: Add {c/z}swap and {c/z}sum optimization

---
 kernel/loongarch64/KERNEL.LOONGSON2K1000 |   6 +
 kernel/loongarch64/KERNEL.LOONGSON3R5    |   6 +
 kernel/loongarch64/csum_lasx.S           | 274 +++++++++++++++
 kernel/loongarch64/csum_lsx.S            | 266 ++++++++++++++
 kernel/loongarch64/cswap_lasx.S          | 394 +++++++++++++++++++++
 kernel/loongarch64/cswap_lsx.S           | 421 +++++++++++++++++++++++
 6 files changed, 1367 insertions(+)
 create mode 100644 kernel/loongarch64/csum_lasx.S
 create mode 100644 kernel/loongarch64/csum_lsx.S
 create mode 100644 kernel/loongarch64/cswap_lasx.S
 create mode 100644 kernel/loongarch64/cswap_lsx.S

diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
index 0fb0bb68f..01f8e4782 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -75,6 +75,12 @@ DNRM2KERNEL = dnrm2_lsx.S
 CNRM2KERNEL = cnrm2_lsx.S
 ZNRM2KERNEL = znrm2_lsx.S
 
+CSWAPKERNEL = cswap_lsx.S
+ZSWAPKERNEL = cswap_lsx.S
+
+CSUMKERNEL = csum_lsx.S
+ZSUMKERNEL = csum_lsx.S
+
 DGEMMKERNEL = dgemm_kernel_8x4.S
 DGEMMINCOPY = dgemm_ncopy_8_lsx.S
 DGEMMITCOPY = dgemm_tcopy_8_lsx.S
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index 1a6a04532..a9e8abaf0 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -75,6 +75,12 @@ DNRM2KERNEL = dnrm2_lasx.S
 CNRM2KERNEL = cnrm2_lasx.S
 ZNRM2KERNEL = znrm2_lasx.S
 
+CSWAPKERNEL = cswap_lasx.S
+ZSWAPKERNEL = cswap_lasx.S
+
+CSUMKERNEL = csum_lasx.S
+ZSUMKERNEL = csum_lasx.S
+
 DGEMMKERNEL = dgemm_kernel_16x4.S
 DGEMMINCOPY = dgemm_ncopy_16.S
 DGEMMITCOPY = dgemm_tcopy_16.S
diff --git a/kernel/loongarch64/csum_lasx.S b/kernel/loongarch64/csum_lasx.S
new file mode 100644
index 000000000..3e65f2c15
--- /dev/null
+++ b/kernel/loongarch64/csum_lasx.S
@@ -0,0 +1,274 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 +#define res1 $xr16 +#define res2 $xr17 + PROLOGUE + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 + xvld VX2, X, 8 * SIZE + xvld VX3, X, 12 * SIZE + xvfadd.d res2, VX2, VX3 + xvfadd.d res1, res1, res2 +#else + xvld VX0, X, 0 * SIZE + xvld VX1, X, 8 * SIZE + xvfadd.s res2, VX0, VX1 + xvfadd.s res1, res2, res1 +#endif + addi.d X, X, 16 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + ADD a1, a1, a2 + ADD s1, a1, s1 + + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + 
xvinsgr2vr.w VX0, t4, 7 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 0 + xvinsgr2vr.w VX1, t2, 1 + xvinsgr2vr.w VX1, t3, 2 + xvinsgr2vr.w VX1, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 4 + xvinsgr2vr.w VX1, t2, 5 + xvinsgr2vr.w VX1, t3, 6 + xvinsgr2vr.w VX1, t4, 7 + xvfadd.s res2, VX0, VX1 + xvfadd.s res1, res2, res1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + ADD a1, a1, a2 + ADD s1, a1, s1 + + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/csum_lsx.S b/kernel/loongarch64/csum_lsx.S new file mode 100644 index 000000000..8de8e27ca --- /dev/null +++ b/kernel/loongarch64/csum_lsx.S @@ -0,0 +1,266 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 +#define res1 $vr16 +#define res2 $vr17 +#define res3 $vr18 + PROLOGUE + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + vld VX2, X, 4 * SIZE + vld VX3, X, 6 * SIZE + vfadd.d res2, VX2, VX3 + vfadd.d res1, res1, res2 + vld VX0, X, 8 * SIZE + vld VX1, X, 10 * SIZE + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + vld VX2, X, 12 * SIZE + vld VX3, X, 14 * SIZE + vfadd.d res2, VX2, VX3 + vfadd.d res1, res1, res2 +#else + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vfadd.s res2, VX0, VX1 + vld VX2, X, 8 * SIZE + vld VX3, X, 12 * SIZE + vfadd.s res3, VX2, VX3 + vfadd.s res2, res3, res2 + vfadd.s res1, res1, res2 +#endif + + addi.d I, I, -1 + addi.d X, X, 16 * SIZE + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + ADD a1, a1, a2 + ADD s1, a1, s1 + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + 
vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfadd.s res2, VX0, VX1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vfadd.s res3, VX2, VX3 + vfadd.s res2, res3, res2 + vfadd.s res1, res1, res2 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + ADD a1, a1, a2 + ADD s1, a1, s1 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/cswap_lasx.S b/kernel/loongarch64/cswap_lasx.S new file mode 100644 index 000000000..d53773d5a --- /dev/null +++ b/kernel/loongarch64/cswap_lasx.S @@ -0,0 +1,394 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define YY $r6 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 + + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + xvst VX2, X, 0 * SIZE + xvst VX3, X, 4 * SIZE + xvst VX0, Y, 0 * SIZE + xvst VX1, Y, 4 * SIZE +#else + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvst VX2, X, 0 * SIZE + xvst VX0, Y, 0 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 +.L113: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + xvstelm.d VX0, Y, 0 * SIZE, 0 + ld.d t2, Y, 1 * SIZE + xvstelm.d VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + xvstelm.d VX0, Y, 0 * SIZE, 2 + ld.d t4, Y, 1 * SIZE + xvstelm.d VX0, Y, 1 * SIZE, 3 + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvst VX2, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + xvstelm.d VX1, Y, 0 * SIZE, 0 + ld.d t2, Y, 1 * SIZE + xvstelm.d VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + xvstelm.d VX1, Y, 0 * SIZE, 2 + ld.d t4, Y, 1 * SIZE + xvstelm.d VX1, Y, 1 * SIZE, 3 + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvst VX3, X, 4 * SIZE +#else + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + xvstelm.w VX0, Y, 0 * SIZE, 0 + ld.w t2, Y, 1 * SIZE + xvstelm.w VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + xvstelm.w VX0, Y, 0 * SIZE, 2 + ld.w t4, Y, 1 * SIZE + xvstelm.w VX0, Y, 1 * SIZE, 3 + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + ld.w t1, Y, 0 * SIZE + xvstelm.w VX0, Y, 0 * SIZE, 4 + ld.w t2, Y, 1 * SIZE + xvstelm.w VX0, Y, 1 * SIZE, 5 + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + xvstelm.w VX0, Y, 0 * SIZE, 6 + ld.w t4, Y, 1 * SIZE + xvstelm.w VX0, Y, 1 * SIZE, 7 + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvst VX2, X, 0 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L123: + LD 
a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + xvld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + xvstelm.d VX2, X, 0 * SIZE, 0 + ld.d t2, X, 1 * SIZE + xvstelm.d VX2, X, 1 * SIZE, 1 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + xvstelm.d VX2, X, 0 * SIZE, 2 + ld.d t4, X, 1 * SIZE + xvstelm.d VX2, X, 1 * SIZE, 3 + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvst VX0, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + xvstelm.d VX3, X, 0 * SIZE, 0 + ld.d t2, X, 1 * SIZE + xvstelm.d VX3, X, 1 * SIZE, 1 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + xvstelm.d VX3, X, 0 * SIZE, 2 + ld.d t4, X, 1 * SIZE + xvstelm.d VX3, X, 1 * SIZE, 3 + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvst VX1, Y, 4 * SIZE +#else + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + xvstelm.w VX2, X, 0 * SIZE, 0 + ld.w t2, X, 1 * SIZE + xvstelm.w VX2, X, 1 * SIZE, 1 + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + xvstelm.w VX2, X, 0 * SIZE, 2 + ld.w t4, X, 1 * SIZE + xvstelm.w VX2, X, 1 * SIZE, 3 + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + xvstelm.w VX2, X, 0 * SIZE, 4 + ld.w t2, X, 1 * SIZE + xvstelm.w VX2, X, 1 * SIZE, 5 + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + xvstelm.w VX2, X, 0 * SIZE, 6 + ld.w t4, X, 1 * SIZE + xvstelm.w VX2, X, 1 * SIZE, 7 + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvst VX0, Y, 0 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L213: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + move XX, X + +.L222: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + ST a1, Y, 0 * SIZE + LD b2, Y, 1 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + LD b3, Y, 0 * SIZE + ST a3, Y, 0 * SIZE + LD b4, Y, 1 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + ST b1, XX, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + ST b2, XX, 1 * SIZE + add.d XX, XX, INCX + LD a3, X, 0 * SIZE + ST b3, XX, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST b4, XX, 1 * SIZE + add.d XX, XX, INCX + + LD b1, Y, 0 * SIZE + ST a1, Y, 0 * SIZE + LD b2, Y, 1 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + LD b3, Y, 0 * SIZE + ST a3, Y, 0 * SIZE + LD b4, Y, 1 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + ST b1, XX, 0 * SIZE + ST b2, XX, 1 * SIZE + add.d XX, XX, INCX + ST b3, XX, 0 * SIZE + ST b4, XX, 1 * SIZE + + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L224: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + 
ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/cswap_lsx.S b/kernel/loongarch64/cswap_lsx.S new file mode 100644 index 000000000..62a869066 --- /dev/null +++ b/kernel/loongarch64/cswap_lsx.S @@ -0,0 +1,421 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define YY $r6 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 + + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + vst VX2, X, 0 * SIZE + vst VX3, X, 2 * SIZE + vst VX0, Y, 0 * SIZE + vst VX1, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX2, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + vst VX2, X, 4 * SIZE + vst VX3, X, 6 * SIZE + vst VX0, Y, 4 * SIZE + vst VX1, Y, 6 * SIZE +#else + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + vst VX2, X, 0 * SIZE + vst VX3, X, 4 * SIZE + vst VX0, Y, 0 * SIZE + vst VX1, Y, 4 * SIZE +#endif + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L113: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + vstelm.d VX0, Y, 0 * SIZE, 0 + ld.d t2, Y, 1 * SIZE + vstelm.d VX0, Y, 1 * SIZE, 1 + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vst VX2, X, 0 * SIZE + vld VX1, X, 2 * SIZE + ld.d t3, Y, 0 * SIZE + vstelm.d VX1, Y, 0 * SIZE, 0 + ld.d t4, Y, 1 * SIZE + vstelm.d VX1, Y, 1 * SIZE, 1 + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vst VX3, X, 2 * SIZE + vld VX0, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + vstelm.d VX0, Y, 0 * SIZE, 0 + ld.d t2, Y, 1 * SIZE + vstelm.d VX0, Y, 1 * SIZE, 1 + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vst VX2, X, 4 * SIZE + vld VX1, X, 6 * SIZE + ld.d t3, Y, 0 * SIZE + vstelm.d VX1, Y, 0 * SIZE, 0 + ld.d t4, Y, 1 * SIZE + vstelm.d VX1, Y, 1 * SIZE, 1 + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vst VX3, X, 6 * SIZE +#else + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + vstelm.w VX0, Y, 0 * SIZE, 0 + ld.w t2, Y, 1 * SIZE + vstelm.w VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + vstelm.w VX0, Y, 0 * SIZE, 2 + ld.w t4, Y, 1 * SIZE + vstelm.w VX0, Y, 1 * SIZE, 3 + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vst VX2, X, 0 * SIZE + + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + vstelm.w VX1, Y, 0 * SIZE, 0 + ld.w t2, Y, 1 * SIZE + vstelm.w VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + vstelm.w 
VX1, Y, 0 * SIZE, 2 + ld.w t4, Y, 1 * SIZE + vstelm.w VX1, Y, 1 * SIZE, 3 + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vst VX3, X, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L123: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + vld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + vstelm.d VX2, X, 0 * SIZE, 0 + ld.d t2, X, 1 * SIZE + vstelm.d VX2, X, 1 * SIZE, 1 + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vst VX0, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + ld.d t3, X, 0 * SIZE + vstelm.d VX3, X, 0 * SIZE, 0 + ld.d t4, X, 1 * SIZE + vstelm.d VX3, X, 1 * SIZE, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vst VX1, Y, 2 * SIZE + vld VX2, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + vstelm.d VX2, X, 0 * SIZE, 0 + ld.d t2, X, 1 * SIZE + vstelm.d VX2, X, 1 * SIZE, 1 + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vst VX0, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t3, X, 0 * SIZE + vstelm.d VX3, X, 0 * SIZE, 0 + ld.d t4, X, 1 * SIZE + vstelm.d VX3, X, 1 * SIZE, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vst VX1, Y, 6 * SIZE +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + vstelm.w VX2, X, 0 * SIZE, 0 + ld.w t2, X, 1 * SIZE + vstelm.w VX2, X, 1 * SIZE, 1 + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + vstelm.w VX2, X, 0 * SIZE, 2 + ld.w t4, X, 1 * SIZE + vstelm.w VX2, X, 1 * SIZE, 3 + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vst VX0, Y, 0 * SIZE + + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + vstelm.w VX3, X, 0 * SIZE, 0 + ld.w t2, X, 1 * SIZE + vstelm.w VX3, X, 1 * SIZE, 1 + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + vstelm.w VX3, X, 0 * SIZE, 2 + ld.w t4, X, 1 * SIZE + vstelm.w VX3, X, 1 * SIZE, 3 + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + vst VX1, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L213: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + move XX, X + +.L222: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + ST a1, Y, 0 * SIZE + LD b2, Y, 1 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + LD b3, Y, 0 * SIZE + ST a3, Y, 0 * SIZE + LD b4, Y, 1 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + ST b1, XX, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + ST b2, XX, 1 * SIZE + add.d XX, XX, INCX + LD a3, X, 0 * SIZE + ST b3, XX, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST b4, XX, 1 * SIZE + add.d XX, XX, INCX + + LD b1, Y, 0 * SIZE + ST a1, Y, 0 * SIZE + LD b2, Y, 1 * SIZE + ST 
a2, Y, 1 * SIZE + add.d Y, Y, INCY + LD b3, Y, 0 * SIZE + ST a3, Y, 0 * SIZE + LD b4, Y, 1 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + ST b1, XX, 0 * SIZE + ST b2, XX, 1 * SIZE + add.d XX, XX, INCX + ST b3, XX, 0 * SIZE + ST b4, XX, 1 * SIZE + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L224: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + + addi.d I, I, -1 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE From a5d0d21378f6da778b18c8f9c5fd4ad33d9b52d5 Mon Sep 17 00:00:00 2001 From: pengxu Date: Fri, 29 Dec 2023 15:10:01 +0800 Subject: [PATCH 21/21] loongarch64: Add zgemm and cgemm optimization --- common_loongarch64.h | 4 + kernel/loongarch64/KERNEL.LOONGSON2K1000 | 12 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 24 +- kernel/loongarch64/cgemm_kernel_2x2_lasx.S | 857 +++++++++++++++++++++ kernel/loongarch64/cgemm_kernel_2x2_lsx.S | 812 +++++++++++++++++++ kernel/loongarch64/cgemm_ncopy_2_lasx.S | 193 +++++ kernel/loongarch64/cgemm_ncopy_2_lsx.S | 202 +++++ kernel/loongarch64/cgemm_tcopy_2_lasx.S | 218 ++++++ kernel/loongarch64/cgemm_tcopy_2_lsx.S | 218 ++++++ kernel/loongarch64/zgemm_kernel_2x2.S | 848 ++++++++++++++++++++ kernel/loongarch64/zgemm_kernel_2x2_lasx.S | 822 ++++++++++++++++++++ kernel/loongarch64/zgemm_ncopy_2_lasx.S | 196 +++++ kernel/loongarch64/zgemm_tcopy_2_lasx.S | 212 +++++ param.h | 8 +- 14 files changed, 4621 insertions(+), 5 deletions(-) create mode 100644 kernel/loongarch64/cgemm_kernel_2x2_lasx.S create mode 100644 kernel/loongarch64/cgemm_kernel_2x2_lsx.S create mode 100644 kernel/loongarch64/cgemm_ncopy_2_lasx.S create mode 100644 kernel/loongarch64/cgemm_ncopy_2_lsx.S create mode 100644 kernel/loongarch64/cgemm_tcopy_2_lasx.S create mode 100644 kernel/loongarch64/cgemm_tcopy_2_lsx.S create mode 100644 kernel/loongarch64/zgemm_kernel_2x2.S create mode 100644 kernel/loongarch64/zgemm_kernel_2x2_lasx.S create mode 100644 kernel/loongarch64/zgemm_ncopy_2_lasx.S create mode 100644 kernel/loongarch64/zgemm_tcopy_2_lasx.S diff --git a/common_loongarch64.h b/common_loongarch64.h index e581e2e3e..b1426da79 100644 --- a/common_loongarch64.h +++ b/common_loongarch64.h @@ -144,6 +144,7 @@ static inline int WhereAmI(void){ #define XVCMPLT xvfcmp.clt.d #define XVMUL xvfmul.d #define XVMSUB xvfmsub.d +#define XVNMSUB xvfnmsub.d #define VFSUB vfsub.d #define VFADD vfadd.d @@ -158,6 +159,7 @@ static inline int WhereAmI(void){ #define VCMPLT vfcmp.clt.d #define VMUL vfmul.d #define VMSUB vfmsub.d +#define VNMSUB vfnmsub.d #else @@ -198,6 +200,7 @@ static inline int WhereAmI(void){ #define XVCMPLT xvfcmp.clt.s #define XVMUL xvfmul.s #define XVMSUB xvfmsub.s +#define XVNMSUB xvfnmsub.s #define VFSUB vfsub.s #define VFADD vfadd.s @@ -212,6 +215,7 @@ static inline int WhereAmI(void){ #define VCMPLT vfcmp.clt.s #define VMUL vfmul.s #define VMSUB vfmsub.s +#define VNMSUB vfnmsub.s #endif /* defined(DOUBLE) */ diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 01f8e4782..c365e9a75 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -95,4 +95,16 @@ DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c 
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CGEMMKERNEL = cgemm_kernel_2x2_lsx.S +CGEMMONCOPY = cgemm_ncopy_2_lsx.S +CGEMMOTCOPY = cgemm_tcopy_2_lsx.S +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index a9e8abaf0..68360faaf 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -107,13 +107,35 @@ SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) SGEMVNKERNEL = sgemv_n_8_lasx.S SGEMVTKERNEL = sgemv_t_8_lasx.S +CGEMMKERNEL = cgemm_kernel_2x2_lsx.S +CGEMMONCOPY = cgemm_ncopy_2_lsx.S +CGEMMOTCOPY = cgemm_tcopy_2_lsx.S +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZGEMMKERNEL = zgemm_kernel_2x2_lasx.S +ZGEMMONCOPY = zgemm_ncopy_2_lasx.S +ZGEMMOTCOPY = zgemm_tcopy_2_lasx.S +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + DTRSMKERNEL_LN = dtrsm_kernel_LN_16x4_lasx.S DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_lasx.S DTRSMKERNEL_RN = dtrsm_kernel_RN_16x4_lasx.S DTRSMKERNEL_RT = dtrsm_kernel_RT_16x4_lasx.S -endif STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif diff --git a/kernel/loongarch64/cgemm_kernel_2x2_lasx.S b/kernel/loongarch64/cgemm_kernel_2x2_lasx.S new file mode 100644 index 000000000..e07f7dc64 --- /dev/null +++ b/kernel/loongarch64/cgemm_kernel_2x2_lasx.S @@ -0,0 +1,857 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + + +/* Function parameters */ +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define ALPHA_R $f0 // param 4: alphar +#define ALPHA_I $f1 // param 5: alphai +#define A $r7 // param 6: ba +#define B $r8 // param 7: bb +#define C $r9 // param 8: bc +#define LDC $r10 // param 9: ldc + +#if defined (TRMMKERNEL) +#define OFFSET $r11 // param 10: offset +#endif +#define OFF $r26 + +#define I $r12 +#define J $r13 +#define L $r14 +#define TL $r15 +#define A0 $r16 +#define B0 $r17 +#define C0 $r18 +#define C1 $r19 +#define C2 $r20 +#define C3 $r23 +#define T0 $r24 +#define T1 $r25 + +#define a1 $f2 +#define a2 $f3 +#define a3 $f4 +#define a4 $f5 +#define a5 $f6 +#define a6 $f7 +#define a7 $f8 +#define a8 $f9 +#define b1 $f10 +#define b2 $f11 +#define b3 $f12 +#define b4 $f13 +#define b5 $f14 +#define b6 $f15 +#define b7 $f16 +#define b8 $f17 +#define c11 $f18 +#define c12 $f19 +#define c21 $f20 +#define c22 $f21 +#define c31 $f22 +#define c32 $f23 +#define c41 $f24 +#define c42 $f25 + +/* LASX vectors */ +#define U0 $xr30 +#define U1 $xr31 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define U8 $xr8 +#define U9 $xr9 +#define U10 $xr10 +#define U11 $xr11 +#define U12 $xr12 +#define U13 $xr13 +#define U14 $xr14 +#define U15 $xr15 +#define D0 $xr16 +#define D1 $xr17 +#define D2 $xr18 +#define D3 $xr19 +#define D4 $xr20 +#define D5 $xr21 +#define D6 $xr22 +#define D7 $xr23 +#define D8 $xr24 +#define D9 $xr25 +#define D10 $xr26 +#define D11 $xr27 +#define VALPHAR $xr28 +#define VALPHAI $xr29 + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define XVMADD1 XVFMADD +#define XVMADD2 XVFMADD +#define XVMADD3 XVNMSUB +#define XVMADD4 XVFMADD + +#define VMADD1 VFMADD +#define VMADD2 VFMADD +#define VMADD3 VNMSUB +#define VMADD4 VFMADD + +#define XVFADD1 XVFADD +#define XVFADD2 XVFADD +#define XVFADD3 XVFSUB +#define XVFADD4 XVFADD + +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define XVMADD1 XVFMADD +#define XVMADD2 XVFMADD +#define XVMADD3 XVFMADD +#define XVMADD4 XVNMSUB + +#define VMADD1 VFMADD +#define VMADD2 VFMADD +#define VMADD3 VFMADD +#define VMADD4 VNMSUB + +#define XVFADD1 XVFADD +#define XVFADD2 XVFADD +#define XVFADD3 XVFADD +#define XVFADD4 XVFSUB + +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define XVMADD1 XVFMADD +#define XVMADD2 XVNMSUB +#define XVMADD3 XVFMADD +#define XVMADD4 XVFMADD + +#define VMADD1 VFMADD +#define VMADD2 VNMSUB +#define VMADD3 VFMADD +#define VMADD4 VFMADD + +#define XVFADD1 XVFADD +#define XVFADD2 XVFSUB +#define XVFADD3 XVFADD +#define XVFADD4 XVFADD + +#define MADD1 MADD +#define MADD2 NMSUB +#define 
MADD3 MADD +#define MADD4 MADD +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define XVMADD1 XVFMADD +#define XVMADD2 XVNMSUB +#define XVMADD3 XVNMSUB +#define XVMADD4 XVNMSUB + +#define VMADD1 VFMADD +#define VMADD2 VNMSUB +#define VMADD3 VNMSUB +#define VMADD4 VNMSUB + +#define XVFADD1 XVFADD +#define XVFADD2 XVFSUB +#define XVFADD3 XVFSUB +#define XVFADD4 XVFSUB + +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + addi.d $sp, $sp, -128 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + ST $f23, $sp, 40 + ST $f24, $sp, 48 + ST $f25, $sp, 56 + ST $f26, $sp, 64 + ST $f27, $sp, 72 + ST $f28, $sp, 80 + ST $f29, $sp, 88 + ST $f30, $sp, 96 + ST $f31, $sp, 104 + ST ALPHA_R,$sp, 112 + ST ALPHA_I,$sp, 120 + + xvldrepl.w VALPHAR, $sp, 112 + xvldrepl.w VALPHAI, $sp, 120 + +#if defined (TRMMKERNEL) && !defined(LEFT) + sub.d OFF, $r0, OFFSET +#else + xor OFF, OFF, OFF +#endif + + slli.d LDC, LDC, 2 + + move J, $r0 + srai.d T0, N, 1 + beq J, T0, .L19 + +.L10: /* for(j=0; j 0) I-- */ + move S1, TS //a_offset1 + add.d S2, TS, TL //a_offset2 + srai.d J, M, 0x02 + add.d TS, TS, T0 + + beq J, ZERO, .L_I3 + +.L_I1: /* if (j > 0) J-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x00 + xvld U2, S2, 0x00 + + xvpermi.q U0, U2, 0x02 + xvpermi.q U2, U1, 0x31 + + xvpermi.d U0, U0, 0xd8 + xvpermi.d U2, U2, 0xd8 + + xvst U0, TD, 0x00 + xvst U2, TD, 0x20 + + addi.d S1, S1, 0x20 // a_offset1 + addi.d S2, S2, 0x20 + addi.d TD, TD, 0x40 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_I1 + +.L_I3: + andi J, M, 0x03 + beq J, ZERO, .L_II20 + +.L_II1: /* j = (m & 3) if (j > 0) */ + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d TD, TD, 0x10 + + addi.d J, J, -1 + blt ZERO, J, .L_II1 + +.L_II20: + addi.d I, I, -1 + blt ZERO, I, .L_J1 + +.L_N0: /* if(n&1)*/ + andi I, N, 0x01 + beq ZERO, I, .L_N00 + +.L_N1: + srai.d J, M, 0x02 + beq ZERO, J, .L_N10 + +.L_N11: /* j = (m >> 2) if (j > 0) */ + xvld U0, TS, 0x00 + + xvst U0, TD, 0x00 + + addi.d TS, TS, 0x20 // a_offset + addi.d TD, TD, 0x20 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N11 + +.L_N10: + andi J, M, 0x03 + beq J, ZERO, .L_N00 + +.L_N12: /* j = (m & 3) if (j > 0) */ + fld.s F0, TS, 0x00 + fld.s F1, TS, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + + addi.d TS, TS, 0x08 // a_offset + addi.d TD, TD, 0x08 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N12 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_ncopy_2_lsx.S b/kernel/loongarch64/cgemm_ncopy_2_lsx.S new file mode 100644 index 000000000..1cf4d87dc --- /dev/null +++ b/kernel/loongarch64/cgemm_ncopy_2_lsx.S @@ -0,0 +1,202 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r11 +#define TL $r7 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define D0 $vr8 +#define D1 $vr9 +#define D2 $vr10 +#define D3 $vr11 +#define D4 $vr12 +#define D5 $vr13 +#define D6 $vr14 +#define D7 $vr15 +#define D8 $vr16 + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TD, DST //boffset + move TS, SRC //aoffset + + slli.d TL, LDA, 0x02 //lda + slli.d TL, TL, 0x01 + slli.d T0, TL, 0x01 + + srai.d I, N, 0x01 + beq I, ZERO, .L_N0 + +.L_J1: /* if (i > 0) I-- */ + move S1, TS //a_offset1 + add.d S2, TS, TL //a_offset2 + srai.d J, M, 0x02 + add.d TS, TS, T0 + + beq J, ZERO, .L_I3 + +.L_I1: /* if (j > 0) J-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vand.v D0, U2, U2 + vand.v D1, U3, U3 + vand.v D2, U2, U2 + vand.v D3, U3, U3 + + vpermi.w D0, U0, 0x44 + vpermi.w D2, U0, 0xee + vpermi.w D1, U1, 0x44 + vpermi.w D3, U1, 0xee + + vst D0, TD, 0x00 + vst D2, TD, 0x10 + vst D1, TD, 0x20 + vst D3, TD, 0x30 + + addi.d S1, S1, 0x20 // a_offset1 + addi.d S2, S2, 0x20 + addi.d TD, TD, 0x40 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_I1 + +.L_I3: + andi J, M, 0x03 + beq J, ZERO, .L_II20 + +.L_II1: /* j = (m & 3) if (j > 0) */ + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d TD, TD, 0x10 + + addi.d J, J, -1 + blt ZERO, J, .L_II1 + +.L_II20: + addi.d I, I, -1 + blt ZERO, I, .L_J1 + +.L_N0: /* if(n&1)*/ + andi I, N, 0x01 + beq ZERO, I, .L_N00 + +.L_N1: + 
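+ // n is odd: copy the one remaining column. .L_N11 moves four complex
+ // (eight float) elements per pass; .L_N12 copies the m & 3 tail one
+ // complex element at a time.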
srai.d J, M, 0x02 + beq ZERO, J, .L_N10 + +.L_N11: /* j = (m >> 2) if (j > 0) */ + vld U0, TS, 0x00 + vld U1, TS, 0x10 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + + addi.d TS, TS, 0x20 // a_offset + addi.d TD, TD, 0x20 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N11 + +.L_N10: + andi J, M, 0x03 + beq J, ZERO, .L_N00 + +.L_N12: /* j = (m & 3) if (j > 0) */ + fld.s F0, TS, 0x00 + fld.s F1, TS, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + + addi.d TS, TS, 0x08 // a_offset + addi.d TD, TD, 0x08 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N12 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_tcopy_2_lasx.S b/kernel/loongarch64/cgemm_tcopy_2_lasx.S new file mode 100644 index 000000000..e2245e412 --- /dev/null +++ b/kernel/loongarch64/cgemm_tcopy_2_lasx.S @@ -0,0 +1,218 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r7 +#define T0 $r18 +#define S8 $r19 +#define S9 $r20 +#define S10 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr8 +#define D1 $xr9 +#define D2 $xr10 +#define D3 $xr11 +#define D4 $xr12 +#define D5 $xr13 +#define D6 $xr14 +#define D7 $xr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x02 //lda + slli.d TL, TL, 0x01 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x02 + add.d S9, DST, T0 //boffset2 + + srai.d J, M, 0x01 //j + + beq J, ZERO, .L_M1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 + add.d S2, S1, TL //aoffset2 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x20 + + srai.d I, N, 0x02 + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + xvld U0, S1, 0x00 + xvld U1, S1, 0x00 + xvld U2, S2, 0x00 + + xvpermi.q U0, U2, 0x02 + xvpermi.q U2, U1, 0x31 + + xvst U0, S8, 0x00 + + slli.d T0, M, 0x04 + add.d S8, S8, T0 + + xvst U2, S8, 0x00 + + add.d S8, S8, T0 + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + + vst $vr0, S8, 0x00 + vst $vr1, S8, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_J0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, S9, 0x00 + fst.s F1, S9, 0x04 + fst.s F2, S9, 0x08 + fst.s F3, S9, 0x0c + + addi.d S9, S9, 0x10 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_M0 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M1N1 + +.L_M1I1: /* if(i>0) */ + vld $vr0, TS, 0x00 + vld $vr1, TS, 0x10 + + vst $vr0, TD, 0x00 + + slli.d T0, M, 0x04 + add.d TD, TD, T0 + + vst $vr1, TD, 0x00 + + add.d TD, TD, T0 + addi.d TS, TS, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_M1I1 + +.L_M1N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M1N2 + + vld $vr0, TS, 0x00 + + vst $vr0, TD, 0x00 + + addi.d TS, TS, 0x10 + +.L_M1N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + fld.s F0, TS, 0x00 + fld.s F1, TS, 0x04 + + fst.s F0, S9, 0x00 + fst.s F1, S9, 0x04 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_tcopy_2_lsx.S b/kernel/loongarch64/cgemm_tcopy_2_lsx.S new file mode 100644 index 000000000..15c0fde8f --- /dev/null +++ b/kernel/loongarch64/cgemm_tcopy_2_lsx.S @@ -0,0 +1,218 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r7 +#define T0 $r18 +#define S8 $r19 +#define S9 $r20 +#define S10 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define D0 $vr8 +#define D1 $vr9 +#define D2 $vr10 +#define D3 $vr11 +#define D4 $vr12 +#define D5 $vr13 +#define D6 $vr14 +#define D7 $vr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x02 //lda + slli.d TL, TL, 0x01 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x02 + add.d S9, DST, T0 //boffset2 + + srai.d J, M, 0x01 //j + + beq J, ZERO, .L_M1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 + add.d S2, S1, TL //aoffset2 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x20 + + srai.d I, N, 0x02 + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, S8, 0x00 + vst U2, S8, 0x10 + + slli.d T0, M, 0x04 + add.d S8, S8, T0 + + vst U1, S8, 0x00 + vst U3, S8, 0x10 + + add.d S8, S8, T0 + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + 
beq ZERO, I, .L_J0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, S9, 0x00 + fst.s F1, S9, 0x04 + fst.s F2, S9, 0x08 + fst.s F3, S9, 0x0c + + addi.d S9, S9, 0x10 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_M0 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M1N1 + +.L_M1I1: /* if(i>0) */ + vld U0, TS, 0x00 + vld U1, TS, 0x10 + + vst U0, TD, 0x00 + + slli.d T0, M, 0x04 + add.d TD, TD, T0 + + vst U1, TD, 0x00 + + add.d TD, TD, T0 + addi.d TS, TS, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_M1I1 + +.L_M1N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M1N2 + + vld U0, TS, 0x00 + + vst U0, TD, 0x00 + + addi.d TS, TS, 0x10 + +.L_M1N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + fld.s F0, TS, 0x00 + fld.s F1, TS, 0x04 + + fst.s F0, S9, 0x00 + fst.s F1, S9, 0x04 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_kernel_2x2.S b/kernel/loongarch64/zgemm_kernel_2x2.S new file mode 100644 index 000000000..589d170c5 --- /dev/null +++ b/kernel/loongarch64/zgemm_kernel_2x2.S @@ -0,0 +1,848 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + + +/* Function parameters */ +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define ALPHA_R $f0 // param 4: alphar +#define ALPHA_I $f1 // param 5: alphai +#define A $r7 // param 6: ba +#define B $r8 // param 7: bb +#define C $r9 // param 8: bc +#define LDC $r10 // param 9: ldc + +#if defined (TRMMKERNEL) +#define OFFSET $r11 // param 10: offset +#endif +#define OFF $r26 + +#define I $r12 +#define J $r13 +#define L $r14 +#define TL $r15 +#define A0 $r16 +#define B0 $r17 +#define C0 $r18 +#define C1 $r19 +#define C2 $r20 +#define C3 $r23 +#define T0 $r24 +#define T1 $r25 + +#define a1 $f2 +#define a2 $f3 +#define a3 $f4 +#define a4 $f5 +#define a5 $f6 +#define a6 $f7 +#define a7 $f8 +#define a8 $f9 +#define b1 $f10 +#define b2 $f11 +#define b3 $f12 +#define b4 $f13 +#define b5 $f14 +#define b6 $f15 +#define b7 $f16 +#define b8 $f17 +#define c11 $f18 +#define c12 $f19 +#define c21 $f20 +#define c22 $f21 +#define c31 $f22 +#define c32 $f23 +#define c41 $f24 +#define c42 $f25 +#define c51 $f26 +#define c52 $f27 +#define c61 $f28 +#define c62 $f29 + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + addi.d $sp, $sp, -88 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + ST $f23, $sp, 32 + ST $f24, $sp, 40 + ST $f25, $sp, 48 + ST $f26, $sp, 56 + ST $f27, $sp, 64 + ST $f28, $sp, 72 + ST $f29, $sp, 80 + + +#if defined (TRMMKERNEL) && !defined(LEFT) + sub.d OFF, $r0, OFFSET +#else + xor OFF, OFF, OFF +#endif + + slli.d LDC, LDC, BASE_SHIFT + + move J, $r0 + srai.d T0, N, 1 + beq J, T0, .L19 + +.L10: /* for(j=0; j 0) I-- */ + move S1, TS //a_offset1 + add.d S2, TS, TL //a_offset2 + srai.d J, M, 0x02 + add.d TS, TS, T0 + + beq J, ZERO, .L_I3 + +.L_I1: /* if (j > 0) J-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvand.v D0, U0, U0 + xvand.v D1, U1, U1 + xvand.v D2, U2, U2 + xvand.v D3, U3, U3 + + xvpermi.q D0, U2, 0x02 + xvpermi.q D2, U0, 0x31 + xvpermi.q D1, U3, 0x02 + xvpermi.q D3, U1, 0x31 + + xvst D0, TD, 0x00 + xvst D2, TD, 0x20 + xvst D1, TD, 0x40 + xvst D3, TD, 0x60 + + addi.d S1, S1, 0x40 // a_offset1 + addi.d S2, S2, 0x40 + addi.d TD, TD, 0x80 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_I1 + +.L_I3: + andi J, M, 0x03 + beq J, ZERO, .L_II20 + +.L_II1: /* j = (m & 3) if (j > 0) */ + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + + vst $vr0, TD, 0x00 + vst $vr1, TD, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d TD, TD, 0x20 + + addi.d J, J, -1 + blt ZERO, J, .L_II1 + +.L_II20: + addi.d I, I, -1 + blt ZERO, I, .L_J1 + +.L_N0: /* if(n&1)*/ + andi I, N, 0x01 + beq ZERO, I, .L_N00 + +.L_N1: + srai.d J, M, 0x02 + beq ZERO, J, .L_N10 + +.L_N11: /* j = (m >> 2) if (j > 0) */ + xvld U0, TS, 0x00 + xvld U1, TS, 0x20 + + xvst U0, TD, 0x00 + xvst U1, TD, 0x20 + + addi.d TS, 
TS, 0x40 // a_offset + addi.d TD, TD, 0x40 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N11 + +.L_N10: + andi J, M, 0x03 + beq J, ZERO, .L_N00 + +.L_N12: /* j = (m & 3) if (j > 0) */ + vld $vr0, TS, 0x00 + vst $vr0, TD, 0x00 + + + addi.d TS, TS, 0x10 // a_offset + addi.d TD, TD, 0x10 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N12 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_tcopy_2_lasx.S b/kernel/loongarch64/zgemm_tcopy_2_lasx.S new file mode 100644 index 000000000..3fe17beef --- /dev/null +++ b/kernel/loongarch64/zgemm_tcopy_2_lasx.S @@ -0,0 +1,212 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r7 +#define T0 $r18 +#define S8 $r19 +#define S9 $r20 +#define S10 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr8 +#define D1 $xr9 +#define D2 $xr10 +#define D3 $xr11 +#define D4 $xr12 +#define D5 $xr13 +#define D6 $xr14 +#define D7 $xr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x03 //lda + slli.d TL, TL, 0x01 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x03 + add.d S9, DST, T0 //boffset2 + + srai.d J, M, 0x01 //j + + beq J, ZERO, .L_M1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 + add.d S2, S1, TL //aoffset2 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x40 + + srai.d I, N, 0x02 + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvst U0, S8, 0x00 + xvst U2, S8, 0x20 + + slli.d T0, M, 0x05 + add.d S8, S8, T0 + + xvst U1, S8, 0x00 + xvst U3, S8, 0x20 + + add.d S8, S8, T0 + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvst U0, S8, 0x00 + xvst U1, S8, 0x20 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_J0 + + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + + vst $vr0, S9, 0x00 + vst $vr1, S9, 0x10 + + addi.d S9, S9, 0x20 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_M0 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M1N1 + +.L_M1I1: /* if(i>0) */ + xvld U0, TS, 0x00 + xvld U1, TS, 0x20 + + xvst U0, TD, 0x00 + + slli.d T0, M, 0x05 + add.d TD, TD, T0 + + xvst U1, TD, 0x00 + + add.d TD, TD, T0 + addi.d TS, TS, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_M1I1 + +.L_M1N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M1N2 + + xvld U0, TS, 0x00 + + xvst U0, TD, 0x00 + + addi.d TS, TS, 0x20 + +.L_M1N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + vld $vr0, TS, 0x00 + + vst $vr0, S9, 0x00 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/param.h b/param.h index e4e242d5d..54760a82e 100644 --- a/param.h +++ b/param.h @@ -2853,13 +2853,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #define QGEMM_DEFAULT_UNROLL_N 2 -#define CGEMM_DEFAULT_UNROLL_N 4 -#define ZGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 #define QGEMM_DEFAULT_UNROLL_M 2 -#define CGEMM_DEFAULT_UNROLL_M 1 -#define ZGEMM_DEFAULT_UNROLL_M 1 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 2 #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_P 256