From 154baad454647fdd6d71e2c907285859718da22e Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Wed, 27 Dec 2023 16:04:33 +0800 Subject: [PATCH] loongarch64: Refine iamin optimization. --- common_loongarch64.h | 10 + kernel/loongarch64/KERNEL.LOONGSON2K1000 | 4 +- kernel/loongarch64/KERNEL.LOONGSON3R5 | 4 +- .../{isamin_lasx.S => iamin_lasx.S} | 270 +++++++---- kernel/loongarch64/iamin_lsx.S | 446 ++++++++++++++++++ kernel/loongarch64/idamin_lasx.S | 275 ----------- kernel/loongarch64/idamin_lsx.S | 228 --------- kernel/loongarch64/isamin_lsx.S | 275 ----------- 8 files changed, 649 insertions(+), 863 deletions(-) rename kernel/loongarch64/{isamin_lasx.S => iamin_lasx.S} (54%) create mode 100644 kernel/loongarch64/iamin_lsx.S delete mode 100644 kernel/loongarch64/idamin_lasx.S delete mode 100644 kernel/loongarch64/idamin_lsx.S delete mode 100644 kernel/loongarch64/isamin_lsx.S diff --git a/common_loongarch64.h b/common_loongarch64.h index 72e900f77..846fc0dbd 100644 --- a/common_loongarch64.h +++ b/common_loongarch64.h @@ -119,6 +119,7 @@ static inline int WhereAmI(void){ #define MOV fmov.d #define CMOVT fsel #define MTC movgr2fr.d +#define MTG movfr2gr.d #define FABS fabs.d #define FMIN fmin.d #define FMINA fmina.d @@ -136,6 +137,8 @@ static inline int WhereAmI(void){ #define XVFMINA xvfmina.d #define XVFMAX xvfmax.d #define XVFMAXA xvfmaxa.d +#define XVCMPEQ xvfcmp.ceq.d +#define XVCMPLT xvfcmp.clt.d #define VFSUB vfsub.d #define VFADD vfadd.d @@ -144,6 +147,8 @@ static inline int WhereAmI(void){ #define VFMINA vfmina.d #define VFMAX vfmax.d #define VFMAXA vfmaxa.d +#define VCMPEQ vfcmp.ceq.d +#define VCMPLT vfcmp.clt.d #else @@ -159,6 +164,7 @@ static inline int WhereAmI(void){ #define MOV fmov.s #define CMOVT fsel #define MTC movgr2fr.w +#define MTG movfr2gr.s #define FABS fabs.s #define FMIN fmin.s #define FMINA fmina.s @@ -176,6 +182,8 @@ static inline int WhereAmI(void){ #define XVFMINA xvfmina.s #define XVFMAX xvfmax.s #define XVFMAXA xvfmaxa.s +#define XVCMPEQ 
xvfcmp.ceq.s +#define XVCMPLT xvfcmp.clt.s #define VFSUB vfsub.s #define VFADD vfadd.s @@ -184,6 +192,8 @@ static inline int WhereAmI(void){ #define VFMINA vfmina.s #define VFMAX vfmax.s #define VFMAXA vfmaxa.s +#define VCMPEQ vfcmp.ceq.s +#define VCMPLT vfcmp.clt.s #endif /* defined(DOUBLE) */ diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index cb230b348..4eae2e4f9 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -28,8 +28,8 @@ IDMINKERNEL = idmin_lsx.S ISAMAXKERNEL = isamax_lsx.S IDAMAXKERNEL = idamax_lsx.S -ISAMINKERNEL = isamin_lsx.S -IDAMINKERNEL = idamin_lsx.S +ISAMINKERNEL = iamin_lsx.S +IDAMINKERNEL = iamin_lsx.S SCOPYKERNEL = copy_lsx.S DCOPYKERNEL = copy_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index ba59c4566..e7e1b5d5a 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -28,8 +28,8 @@ IDMINKERNEL = idmin_lasx.S ISAMAXKERNEL = isamax_lasx.S IDAMAXKERNEL = idamax_lasx.S -ISAMINKERNEL = isamin_lasx.S -IDAMINKERNEL = idamin_lasx.S +ISAMINKERNEL = iamin_lasx.S +IDAMINKERNEL = iamin_lasx.S SCOPYKERNEL = copy_lasx.S DCOPYKERNEL = copy_lasx.S diff --git a/kernel/loongarch64/isamin_lasx.S b/kernel/loongarch64/iamin_lasx.S similarity index 54% rename from kernel/loongarch64/isamin_lasx.S rename to kernel/loongarch64/iamin_lasx.S index cbdf32530..6ea117907 100644 --- a/kernel/loongarch64/isamin_lasx.S +++ b/kernel/loongarch64/iamin_lasx.S @@ -1,3 +1,30 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + #define ASSEMBLER #include "common.h" @@ -39,6 +66,31 @@ slli.d INCX, INCX, BASE_SHIFT bne INCX, TEMP, .L20 xvld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else addi.w i0, i0, 1 srai.d I, N, 3 bge $r0, I, .L21 @@ -76,25 +128,50 @@ xvinsgr2vr.w VI0, i0, 6 //7 addi.w i0, i0, 1 xvinsgr2vr.w VI0, i0, 7 //8 +#endif .align 3 .L10: xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvadd.d VI1, VI1, VINC8 + xvld VX1, X, 4 * SIZE + xvadd.d VI2, VI1, VINC4 + xvfmina.d VM1, VX0, VX1 + xvfcmp.ceq.d VT0, VX0, VM1 addi.d I, I, -1 - xvadd.w VI1, VI1, VINC8 + xvbitsel.v VI2, VI2, VI1, VT0 + xvfmina.d VM1, VM0, VM1 +#else + addi.d I, I, -1 + xvadd.w VI1, VI1, VINC8 //advance the running lane indices every iteration + xvor.v VI2, VI1, VI1 xvfmina.s VM1, VX0, VM0 - xvfcmp.ceq.s VT0, VM0, VM1 +#endif + XVCMPEQ VT0, VM0, VM1 addi.d X, X, 8 * SIZE xvbitsel.v VM0, VM1, VM0, VT0 - xvbitsel.v VI0, VI1, VI0, VT0 + xvbitsel.v VI0, VI2, VI0, VT0 blt $r0, I, .L10 .align 3 .L15: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmina.d VM1, x1, x2 + xvfcmp.ceq.d VT0, x1, VM1 +#else xvxor.v VX0, VX0, VX0 - xvor.v VX0, VI0, VX0 + xvor.v VX0, VI0, VX0 xvxor.v VX1, VX1, VX1 - xvor.v VX1, VM0, VX1 + xvor.v VX1, VM0, VX1 xvpickve.w VI1, VI0, 0 xvpickve.w VI2, VI0, 1 xvpickve.w VI3, VI0, 2 
@@ -105,28 +181,62 @@ xvpickve.w x4, VM0, 3 xvfmina.s VM1, x1, x2 xvfcmp.ceq.s VT0, x1, VM1 +#endif xvbitsel.v VINC4, VI2, VI1, VT0 - xvfmina.s VM0, x3, x4 - xvfcmp.ceq.s VT0, x3, VM0 + XVFMINA VM0, x4, x3 + XVCMPEQ VT0, x3, VM0 xvbitsel.v VINC8, VI4, VI3, VT0 - xvfmina.s VM0, VM0, VM1 - xvfcmp.ceq.s VT0, VM0, VM1 + XVFMINA VM0, VM0, VM1 + XVCMPEQ VT0, VM0, VM1 xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - xvfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f9 bceqz $fcc0, .L26 - xvfcmp.clt.s VT0, VI1, VI0 + XVCMPLT VT0, VI1, VI0 xvbitsel.v VI0, VI0, VI1, VT0 b .L26 .align 3 .L20: // INCX!=1 move TEMP, X - addi.w i0, i0, 1 - ld.w t1, TEMP, 0 * SIZE +#ifdef DOUBLE + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t2, 1 + xvinsgr2vr.d VM0, t3, 2 + xvinsgr2vr.d VM0, t4, 3 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE add.d TEMP, TEMP, INCX xvinsgr2vr.w VM0, t1, 0 srai.d I, N, 3 @@ -186,9 +296,43 @@ xvinsgr2vr.w VI0, i0, 6 //7 addi.w i0, i0, 1 xvinsgr2vr.w VI0, i0, 7 //8 +#endif .align 3 .L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, 
X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + xvadd.d VI1, VI1, VINC8 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvadd.d VI2, VI1, VINC4 + xvfmina.d VM1, VX0, VX1 + xvfcmp.ceq.d VT0, VX0, VM1 + xvbitsel.v VI2, VI2, VI1, VT0 + xvfmina.d VM1, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 +#else ld.w t1, X, 0 * SIZE add.d X, X, INCX ld.w t2, X, 0 * SIZE @@ -213,73 +357,44 @@ xvinsgr2vr.w VX0, t2, 5 xvinsgr2vr.w VX0, t3, 6 xvinsgr2vr.w VX0, t4, 7 - xvadd.w VI1, VI1, VINC8 + xvadd.w VI1, VI1, VINC8 //advance the running lane indices every iteration + xvor.v VI2, VI1, VI1 xvfmina.s VM1, VX0, VM0 xvfcmp.ceq.s VT0, VM1, VM0 +#endif addi.d I, I, -1 xvbitsel.v VM0, VM1, VM0, VT0 - xvbitsel.v VI0, VI1, VI0, VT0 + xvbitsel.v VI0, VI2, VI0, VT0 blt $r0, I, .L24 + b .L15 //reuse the unit-stride lane reduction .align 3 -.L25: - xvxor.v VX0, VX0, VX0 - xvor.v VX0, VI0, VX0 - xvxor.v VX1, VX1, VX1 - xvor.v VX1, VM0, VX1 - xvpickve.w VI1, VI0, 0 - xvpickve.w VI2, VI0, 1 - xvpickve.w VI3, VI0, 2 - xvpickve.w VI4, VI0, 3 - xvpickve.w x1, VM0, 0 - xvpickve.w x2, VM0, 1 - xvpickve.w x3, VM0, 2 - xvpickve.w x4, VM0, 3 - xvfmina.s VM1, x1, x2 - xvfcmp.ceq.s VT0, x1, VM1 - xvbitsel.v VINC4, VI2, VI1, VT0 - xvfmina.s VM0, x3, x4 - xvfcmp.ceq.s VT0, x3, VM0 - xvbitsel.v VINC8, VI3, VI4, VT0 - xvfmina.s VM0, VM0, VM1 - xvfcmp.ceq.s VT0, VM0, VM1 - xvbitsel.v VM0, VM0, VM1, VT0 - xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - xvfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - xvfcmp.clt.s VT0, VI1, VI0 - xvbitsel.v VI0, VI0, VI1, VT0 - .align 3 - .L26: - xvfcmp.ceq.s VT0, VM0, x2 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f10 bceqz $fcc0, .L27 - 
xvfcmp.clt.s VT0, VI2, VI0 + XVCMPLT VT0, VI2, VI0 xvbitsel.v VI0, VI0, VI2, VT0 .align 3 .L27: - xvfcmp.ceq.s VT0, VM0, x3 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f11 bceqz $fcc0, .L28 - xvfcmp.clt.s VT0, VI3, VI0 + XVCMPLT VT0, VI3, VI0 xvbitsel.v VI0, VI0, VI3, VT0 .align 3 .L28: - xvfcmp.ceq.s VT0, VM0, x4 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f12 bceqz $fcc0, .L29 - xvfcmp.clt.s VT0, VI4, VI0 + XVCMPLT VT0, VI4, VI0 xvbitsel.v VI0, VI0, VI4, VT0 .align 3 .L29: +#ifdef DOUBLE + movfr2gr.d i0, $f20 + .align 3 +#else fmov.s $f16, $f20 .align 3 @@ -306,35 +419,28 @@ xvfmina.s VM0, VM0, VM1 xvfcmp.ceq.s VT0, VM0, VM1 xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - xvfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f9 bceqz $fcc0, .L262 xvfcmp.clt.s VT0, VI1, VI0 xvbitsel.v VI0, VI0, VI1, VT0 .align 3 .L262: - xvfcmp.ceq.s VT0, VM0, x2 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f10 bceqz $fcc0, .L272 xvfcmp.clt.s VT0, VI2, VI0 xvbitsel.v VI0, VI0, VI2, VT0 .align 3 .L272: - xvfcmp.ceq.s VT0, VM0, x3 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f11 bceqz $fcc0, .L282 xvfcmp.clt.s VT0, VI3, VI0 xvbitsel.v VI0, VI0, VI3, VT0 .align 3 .L282: - xvfcmp.ceq.s VT0, VM0, x4 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f12 bceqz $fcc0, .L292 xvfcmp.clt.s VT0, VI4, VI0 xvbitsel.v VI0, VI0, VI4, VT0 @@ -346,9 +452,11 @@ xvbitsel.v VI0, VI0, VI1, VT0 movfr2gr.s i0, $f20 -.L21: //N<8 - andi I, N, 7 - bge $r0, I, .L999 +#endif + +.L21: // N<8 + andi I, N, 7 + bge $r0, I, .L999 srai.d i1, N, 3 slli.d i1, i1, 3 addi.d i1, i1, 1 //current index @@ -357,17 +465,17 @@ .align 3 .L22: - fld.s $f9, X, 0 + LD $f9, X, 0 addi.d I, I, -1 - xvfmina.s VM1, x1, VM0 - xvfcmp.ceq.s VT0, VM0, VM1 - add.d X, X, INCX + XVFMINA VM1, x1, VM0 + XVCMPEQ VT0, VM0, VM1 + add.d X, X, INCX xvbitsel.v VM0, VM1, VM0, VT0 xvbitsel.v VI0, 
VI1, VI0, VT0 addi.d i1, i1, 1 movgr2fr.d $f21, i1 blt $r0, I, .L22 - movfr2gr.s i0, $f20 + MTG i0, $f20 .align 3 .L999: @@ -375,4 +483,4 @@ jirl $r0, $r1, 0x0 .align 3 - EPILOGUE \ No newline at end of file + EPILOGUE diff --git a/kernel/loongarch64/iamin_lsx.S b/kernel/loongarch64/iamin_lsx.S new file mode 100644 index 000000000..ce885fd88 --- /dev/null +++ b/kernel/loongarch64/iamin_lsx.S @@ -0,0 +1,446 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#ifdef DOUBLE +#define VINC2 $vr17 +#define VINC4 $vr18 +#else +#define VINC4 $vr17 +#define VINC8 $vr18 +#endif +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + vld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC2, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 
//initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L10: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 2 * SIZE + vadd.d VI2, VI1, VINC2 + vfmina.d x1, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x1 + vbitsel.v x2, VI2, VI1, VT0 + vld VX0, X, 4 * SIZE + vadd.d VI1, VI2, VINC2 + vld VX1, X, 6 * SIZE + vadd.d VI2, VI1, VINC2 + vfmina.d x3, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x3 + vbitsel.v x4, VI2, VI1, VT0 + vfmina.d x3, x1, x3 + vfcmp.ceq.d VT0, x1, x3 + addi.d I, I, -1 + vbitsel.v x2, x4, x2, VT0 + vfmina.d VM1, VM0, x3 +#else + vadd.w VI1, VI1, VINC8 + vld VX1, X, 4 * SIZE + vadd.w VI2, VI1, VINC4 + vfmina.s VM1, VX0, VX1 + vfcmp.ceq.s VT0, VX0, VM1 + addi.d I, I, -1 + vbitsel.v x2, VI2, VI1, VT0 + vfmina.s VM1, VM0, VM1 +#endif + VCMPEQ VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, x2, VI0, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmina.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC4, VI2, VI1, VT0 + vfmina.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC8, VI4, VI3, VT0 + vfmina.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s 
VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L26 +#endif + .align 3 + +.L20: // INCX!=1 + move TEMP, X +#ifdef DOUBLE + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + + vinsgr2vr.d VM0, t2, 1 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC2, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + + vreplvei.d VI1, VI0, 0 + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t2, 1 + vinsgr2vr.w VM0, t3, 2 + vinsgr2vr.w VM0, t4, 3 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI1, VINC4 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, VINC2 + vfmina.d x1, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x1 + vbitsel.v x2, 
VI2, VI1, VT0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI2, VINC2 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, VINC2 + vfmina.d x3, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x3 + vbitsel.v x4, VI2, VI1, VT0 + vfmina.d x3, x1, x3 + vfcmp.ceq.d VT0, x1, x3 + addi.d I, I, -1 + vbitsel.v x2, x4, x2, VT0 + vfmina.d VM1, VM0, x3 + vfcmp.ceq.d VT0, VM0, VM1 //build the keep-mask before VM0 is overwritten + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, x2, VI0, VT0 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vadd.w VI1, VI1, VINC8 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vadd.w VI2, VI1, VINC4 + vfmina.s VM1, VX0, VX1 + vfcmp.ceq.s VT0, VX0, VM1 + vbitsel.v VI2, VI2, VI1, VT0 + vfmina.s VM1, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + addi.d I, I, -1 + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI2, VI0, VT0 +#endif + blt $r0, I, .L24 + .align 3 + +.L25: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmina.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC4, VI2, VI1, VT0 + 
vfmina.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC8, VI4, VI3, VT0 + vfmina.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 +#endif + .align 3 + +.L26: +#ifdef DOUBLE + vfmina.d VM0, x1, x2 + vfcmp.ceq.d VT0, x1, VM0 + vbitsel.v VI0, VI2, VI1, VT0 + .align 3 + +.L27: + movfr2gr.d i0, $f20 + +#else + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + vfcmp.clt.s VT0, VI2, VI0 + vbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + vfcmp.clt.s VT0, VI3, VI0 + vbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + vfcmp.clt.s VT0, VI4, VI0 + vbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: + movfr2gr.s i0, $f20 +#endif + .align 3 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + LD $f9, X, 0 + addi.d I, I, -1 + VFMINA VM1, x1, VM0 + VCMPEQ VT0, VM0, VM1 + add.d X, X, INCX + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + addi.d i1, i1, 1 + MTC $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 //precision-matched move: the index lane is 64-bit when DOUBLE + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/idamin_lasx.S b/kernel/loongarch64/idamin_lasx.S deleted file mode 100644 index 6ef1e8903..000000000 --- a/kernel/loongarch64/idamin_lasx.S +++ /dev/null @@ -1,275 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define t1 $r13 -#define t2 $r15 -#define t3 $r18 -#define t4 $r16 -#define i0 $r17 -#define i1 $r14 -#define TEMP $r19 -#define x1 $xr9 -#define x2 $xr10 -#define x3 $xr11 -#define x4 $xr12 -#define VX0 $xr13 -#define VX1 $xr14 -#define VM0 $xr15 -#define VM1 $xr16 -#define VINC4 $xr17 -#define 
VINC8 $xr18 -#define VI0 $xr20 -#define VI1 $xr21 -#define VI2 $xr22 -#define VI3 $xr8 -#define VI4 $xr19 -#define VT0 $xr23 - - PROLOGUE - li.d i0, 0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - xvld VM0, X, 0 - addi.d i0, i0, 1 - srai.d I, N, 3 - bge $r0, I, .L21 - slli.d i0, i0, 2 //4 - xvreplgr2vr.d VINC4, i0 - slli.d i0, i0, 1 //8 - xvreplgr2vr.d VINC8, i0 - addi.d i0, i0, -15 - xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 3 - addi.d i0, i0, 5 - xvinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 1 //2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 2 //3 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 3 //4 - .align 3 - -.L10: - xvld VX0, X, 0 * SIZE - xvadd.d VI1, VI1, VINC8 - xvld VX1, X, 4 * SIZE - xvadd.d VI2, VI1, VINC4 - xvfmina.d VM1, VX0, VX1 - xvfcmp.ceq.d VT0, VX0, VM1 - addi.d I, I, -1 - xvbitsel.v VI2, VI2, VI1, VT0 - xvfmina.d VM1, VM0, VM1 - xvfcmp.ceq.d VT0, VM0, VM1 - addi.d X, X, 8 * SIZE - xvbitsel.v VM0, VM1, VM0, VT0 - xvbitsel.v VI0, VI2, VI0, VT0 - blt $r0, I, .L10 - .align 3 - -.L15: - xvpickve.d VI1, VI0, 0 - xvpickve.d VI2, VI0, 1 - xvpickve.d VI3, VI0, 2 - xvpickve.d VI4, VI0, 3 - xvpickve.d x1, VM0, 0 - xvpickve.d x2, VM0, 1 - xvpickve.d x3, VM0, 2 - xvpickve.d x4, VM0, 3 - xvfmina.d VM1, x1, x2 - xvfcmp.ceq.d VT0, x1, VM1 - xvbitsel.v VINC4, VI2, VI1, VT0 - xvfmina.d VM0, x4, x3 - xvfcmp.ceq.d VT0, x3, VM0 - xvbitsel.v VINC8, VI4, VI3, VT0 - xvfmina.d VM0, VM0, VM1 - xvfcmp.ceq.d VT0, VM0, VM1 - xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - xvfcmp.ceq.d VT0, VM0, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - xvfcmp.clt.d VT0, VI1, VI0 - xvbitsel.v VI0, VI0, VI1, VT0 - b .L26 - .align 3 - 
-.L20: // INCX!=1 - move TEMP, X - addi.d i0, i0, 1 - ld.d t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - xvinsgr2vr.d VM0, t1, 0 - srai.d I, N, 3 - bge $r0, I, .L21 - ld.d t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.d t3, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.d t4, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - xvinsgr2vr.d VM0, t2, 1 - xvinsgr2vr.d VM0, t3, 2 - xvinsgr2vr.d VM0, t4, 3 - slli.d i0, i0, 2 //4 - xvreplgr2vr.d VINC4, i0 - slli.d i0, i0, 1 //8 - xvreplgr2vr.d VINC8, i0 - addi.d i0, i0, -15 - xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 3 - addi.d i0, i0, 5 - xvinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 1 //2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 2 //3 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 3 //4 - .align 3 - -.L24: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 - xvinsgr2vr.d VX0, t3, 2 - xvinsgr2vr.d VX0, t4, 3 - xvadd.d VI1, VI1, VINC8 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX1, t1, 0 - xvinsgr2vr.d VX1, t2, 1 - xvinsgr2vr.d VX1, t3, 2 - xvinsgr2vr.d VX1, t4, 3 - xvadd.d VI2, VI1, VINC4 - xvfmina.d VM1, VX0, VX1 - xvfcmp.ceq.d VT0, VX0, VM1 - xvbitsel.v VI2, VI2, VI1, VT0 - xvfmina.d VM1, VM0, VM1 - xvfcmp.ceq.d VT0, VM0, VM1 - addi.d I, I, -1 - xvbitsel.v VM0, VM1, VM0, VT0 - xvbitsel.v VI0, VI2, VI0, VT0 - blt $r0, I, .L24 - .align 3 - -.L25: - xvpickve.d VI1, VI0, 0 - xvpickve.d VI2, VI0, 1 - xvpickve.d VI3, VI0, 2 - xvpickve.d VI4, VI0, 3 - xvpickve.d x1, VM0, 0 - xvpickve.d x2, VM0, 1 - xvpickve.d x3, VM0, 2 - xvpickve.d x4, VM0, 3 - 
xvfmina.d VM1, x1, x2 - xvfcmp.ceq.d VT0, x1, VM1 - xvbitsel.v VINC4, VI2, VI1, VT0 - xvfmina.d VM0, x4, x3 - xvfcmp.ceq.d VT0, x3, VM0 - xvbitsel.v VINC8, VI4, VI3, VT0 - xvfmina.d VM0, VM0, VM1 - xvfcmp.ceq.d VT0, VM0, VM1 - xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - xvfcmp.ceq.d VT0, VM0, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - xvfcmp.clt.d VT0, VI1, VI0 - xvbitsel.v VI0, VI0, VI1, VT0 - .align 3 - -.L26: - xvfcmp.ceq.d VT0, VM0, x2 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L27 - xvfcmp.clt.d VT0, VI2, VI0 - xvbitsel.v VI0, VI0, VI2, VT0 - .align 3 - -.L27: - xvfcmp.ceq.d VT0, VM0, x3 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L28 - xvfcmp.clt.d VT0, VI3, VI0 - xvbitsel.v VI0, VI0, VI3, VT0 - .align 3 - -.L28: - xvfcmp.ceq.d VT0, VM0, x4 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L29 - xvfcmp.clt.d VT0, VI4, VI0 - xvbitsel.v VI0, VI0, VI4, VT0 - .align 3 - -.L29: - movfr2gr.d i0, $f20 - .align 3 - -.L21: // N<8 - andi I, N, 7 - bge $r0, I, .L999 - srai.d i1, N, 3 - slli.d i1, i1, 3 - addi.d i1, i1, 1 //current index - movgr2fr.d $f21, i1 - movgr2fr.d $f20, i0 - .align 3 - -.L22: - fld.d $f9, X, 0 - addi.d I, I, -1 - xvfmina.d VM1, x1, VM0 - xvfcmp.ceq.d VT0, VM0, VM1 - add.d X, X, INCX - xvbitsel.v VM0, VM1, VM0, VT0 - xvbitsel.v VI0, VI1, VI0, VT0 - addi.d i1, i1, 1 - movgr2fr.d $f21, i1 - blt $r0, I, .L22 - movfr2gr.d i0, $f20 - .align 3 - -.L999: - move $r4, $r17 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/idamin_lsx.S b/kernel/loongarch64/idamin_lsx.S deleted file mode 100644 index 9eb9d883f..000000000 --- a/kernel/loongarch64/idamin_lsx.S +++ /dev/null @@ -1,228 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define t1 $r13 -#define t2 $r15 -#define t3 $r18 -#define t4 $r16 -#define i0 $r17 -#define i1 $r14 -#define TEMP $r19 -#define x1 $vr9 -#define x2 
$vr10 -#define x3 $vr11 -#define x4 $vr12 -#define VX0 $vr13 -#define VX1 $vr14 -#define VM0 $vr15 -#define VM1 $vr16 -#define VINC2 $vr17 -#define VINC4 $vr18 -#define VI0 $vr20 -#define VI1 $vr21 -#define VI2 $vr22 -#define VI3 $vr8 -#define VI4 $vr19 -#define VT0 $vr23 - - PROLOGUE - li.d i0, 0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - vld VM0, X, 0 - addi.d i0, i0, 1 - srai.d I, N, 3 - bge $r0, I, .L21 - slli.d i0, i0, 1 //2 - vreplgr2vr.d VINC2, i0 - slli.d i0, i0, 1 //4 - vreplgr2vr.d VINC4, i0 - addi.d i0, i0, -7 - vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - vinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 3 - vinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - vinsgr2vr.d VI0, i0, 1 //2 - .align 3 - -.L10: - vld VX0, X, 0 * SIZE - vadd.d VI1, VI1, VINC4 - vld VX1, X, 2 * SIZE - vadd.d VI2, VI1, VINC2 - vfmina.d x1, VX0, VX1 - vfcmp.ceq.d VT0, VX0, x1 - vbitsel.v x2, VI2, VI1, VT0 - vld VX0, X, 4 * SIZE - vadd.d VI1, VI2, VINC2 - vld VX1, X, 6 * SIZE - vadd.d VI2, VI1, VINC2 - vfmina.d x3, VX0, VX1 - vfcmp.ceq.d VT0, VX0, x3 - vbitsel.v x4, VI2, VI1, VT0 - vfmina.d x3, x1, x3 - vfcmp.ceq.d VT0, x1, x3 - addi.d I, I, -1 - vbitsel.v x2, x4, x2, VT0 - vfmina.d VM1, VM0, x3 - vfcmp.ceq.d VT0, VM0, VM1 - addi.d X, X, 8 * SIZE - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, x2, VI0, VT0 - blt $r0, I, .L10 - .align 3 - -.L15: - vreplvei.d VI1, VI0, 0 - vreplvei.d VI2, VI0, 1 - vreplvei.d x1, VM0, 0 - vreplvei.d x2, VM0, 1 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - vfcmp.ceq.d VT0, x2, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.d VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - b .L27 - .align 3 - -.L20: // INCX!=1 - move TEMP, X - addi.d i0, i0, 1 - ld.d t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.d VM0, t1, 0 - srai.d I, N, 3 - bge $r0, I, .L21 - ld.d t2, 
TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.d VM0, t2, 1 - slli.d i0, i0, 1 //2 - vreplgr2vr.d VINC2, i0 - slli.d i0, i0, 1 //4 - vreplgr2vr.d VINC4, i0 - addi.d i0, i0, -7 - vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - vinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 3 - vinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - vinsgr2vr.d VI0, i0, 1 //2 - .align 3 - -.L24: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - vadd.d VI1, VI1, VINC4 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - vadd.d VI2, VI1, VINC2 - vfmina.d x1, VX0, VX1 - vfcmp.ceq.d VT0, VX0, x1 - vbitsel.v x2, VI2, VI1, VT0 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - vadd.d VI1, VI2, VINC2 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - vadd.d VI2, VI1, VINC2 - vfmina.d x3, VX0, VX1 - vfcmp.ceq.d VT0, VX0, x3 - vbitsel.v x4, VI2, VI1, VT0 - vfmina.d x3, x1, x3 - vfcmp.ceq.d VT0, x1, x3 - addi.d I, I, -1 - vbitsel.v x2, x4, x2, VT0 - vfmina.d VM1, VM0, x3 - vbitsel.v VM0, VM1, VM0, VT0 - vfcmp.ceq.d VT0, VM0, VM1 - vbitsel.v VI0, x2, VI0, VT0 - blt $r0, I, .L24 - .align 3 - -.L25: - vreplvei.d VI1, VI0, 0 - vreplvei.d VI2, VI0, 1 - vreplvei.d x1, VM0, 0 - vreplvei.d x2, VM0, 1 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - vfcmp.ceq.d VT0, x2, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.d VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - b .L27 - .align 3 - -.L26: - vfmina.d VM0, x1, x2 - vfcmp.ceq.d VT0, x1, VM0 - vbitsel.v VI0, VI2, VI1, VT0 - .align 3 - -.L27: - movfr2gr.d i0, $f20 - .align 3 - -.L21: //N<8 - andi I, N, 7 - bge $r0, I, .L999 - srai.d i1, N, 3 
- slli.d i1, i1, 3 - addi.d i1, i1, 1 //current index - movgr2fr.d $f21, i1 - movgr2fr.d $f20, i0 - .align 3 - -.L22: - fld.d $f9, X, 0 - addi.d I, I, -1 - vfmina.d VM1, x1, VM0 - vfcmp.ceq.d VT0, VM0, VM1 - add.d X, X, INCX - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, VI1, VI0, VT0 - addi.d i1, i1, 1 - movgr2fr.d $f21, i1 - blt $r0, I, .L22 - movfr2gr.d i0, $f20 - .align 3 - -.L999: - move $r4, $r17 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/isamin_lsx.S b/kernel/loongarch64/isamin_lsx.S deleted file mode 100644 index 598888660..000000000 --- a/kernel/loongarch64/isamin_lsx.S +++ /dev/null @@ -1,275 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define t1 $r13 -#define t2 $r15 -#define t3 $r18 -#define t4 $r16 -#define i0 $r17 -#define i1 $r14 -#define TEMP $r19 -#define x1 $vr9 -#define x2 $vr10 -#define x3 $vr11 -#define x4 $vr12 -#define VX0 $vr13 -#define VX1 $vr14 -#define VM0 $vr15 -#define VM1 $vr16 -#define VINC4 $vr17 -#define VINC8 $vr18 -#define VI0 $vr20 -#define VI1 $vr21 -#define VI2 $vr22 -#define VI3 $vr8 -#define VI4 $vr19 -#define VT0 $vr23 - - PROLOGUE - li.d i0, 0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - vld VM0, X, 0 - addi.w i0, i0, 1 - srai.d I, N, 3 - bge $r0, I, .L21 - slli.w i0, i0, 2 //4 - vreplgr2vr.w VINC4, i0 - slli.w i0, i0, 1 //8 - vreplgr2vr.w VINC8, i0 - addi.w i0, i0, -15 - vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 1 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 2 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 3 - addi.w i0, i0, 5 - vinsgr2vr.w VI0, i0, 0 //1 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 1 //2 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 2 //3 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 3 //4 - .align 3 - -.L10: - vld VX0, X, 0 * SIZE - vadd.w VI1, 
VI1, VINC8 - vld VX1, X, 4 * SIZE - vadd.w VI2, VI1, VINC4 - vfmina.s VM1, VX0, VX1 - vfcmp.ceq.s VT0, VX0, VM1 - addi.d I, I, -1 - vbitsel.v VI2, VI2, VI1, VT0 - vfmina.s VM1, VM0, VM1 - vfcmp.ceq.s VT0, VM0, VM1 - addi.d X, X, 8 * SIZE - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, VI2, VI0, VT0 - blt $r0, I, .L10 - .align 3 - -.L15: - vreplvei.w VI1, VI0, 0 - vreplvei.w VI2, VI0, 1 - vreplvei.w VI3, VI0, 2 - vreplvei.w VI4, VI0, 3 - vreplvei.w x1, VM0, 0 - vreplvei.w x2, VM0, 1 - vreplvei.w x3, VM0, 2 - vreplvei.w x4, VM0, 3 - vfmina.s VM1, x1, x2 - vfcmp.ceq.s VT0, VM1, x1 - vbitsel.v VINC4, VI2, VI1, VT0 - vfmina.s VM0, x3, x4 - vfcmp.ceq.s VT0, x3, VM0 - vbitsel.v VINC8, VI4, VI3, VT0 - vfmina.s VM0, VM0, VM1 - vfcmp.ceq.s VT0, VM0, VM1 - vbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - vfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.s VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - b .L26 - .align 3 - -.L20: // INCX!=1 - move TEMP, X - addi.w i0, i0, 1 - ld.w t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.w VM0, t1, 0 - srai.d I, N, 3 - bge $r0, I, .L21 - ld.w t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.w t3, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.w t4, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.w VM0, t2, 1 - vinsgr2vr.w VM0, t3, 2 - vinsgr2vr.w VM0, t4, 3 - slli.w i0, i0, 2 //4 - vreplgr2vr.w VINC4, i0 - slli.w i0, i0, 1 //8 - vreplgr2vr.w VINC8, i0 - addi.w i0, i0, -15 - vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 1 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 2 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 3 - addi.w i0, i0, 5 - vinsgr2vr.w VI0, i0, 0 //1 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 1 //2 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 2 //3 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 3 //4 - .align 3 - -.L24: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w 
t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - vadd.w VI1, VI1, VINC8 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vadd.w VI2, VI1, VINC4 - vfmina.s VM1, VX0, VX1 - vfcmp.ceq.s VT0, VX0, VM1 - vbitsel.v VI2, VI2, VI1, VT0 - vfmina.s VM1, VM0, VM1 - vfcmp.ceq.s VT0, VM0, VM1 - addi.d I, I, -1 - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, VI2, VI0, VT0 - blt $r0, I, .L24 - .align 3 - -.L25: - vreplvei.w VI1, VI0, 0 - vreplvei.w VI2, VI0, 1 - vreplvei.w VI3, VI0, 2 - vreplvei.w VI4, VI0, 3 - vreplvei.w x1, VM0, 0 - vreplvei.w x2, VM0, 1 - vreplvei.w x3, VM0, 2 - vreplvei.w x4, VM0, 3 - vfmina.s VM1, x1, x2 - vfcmp.ceq.s VT0, VM1, x1 - vbitsel.v VINC4, VI2, VI1, VT0 - vfmina.s VM0, x3, x4 - vfcmp.ceq.s VT0, x3, VM0 - vbitsel.v VINC8, VI4, VI3, VT0 - vfmina.s VM0, VM0, VM1 - vfcmp.ceq.s VT0, VM0, VM1 - vbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - vfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.s VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - .align 3 - -.L26: - vfcmp.ceq.s VT0, VM0, x2 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L27 - vfcmp.clt.s VT0, VI2, VI0 - vbitsel.v VI0, VI0, VI2, VT0 - .align 3 - -.L27: - vfcmp.ceq.s VT0, VM0, x3 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L28 - vfcmp.clt.s VT0, VI3, VI0 - vbitsel.v VI0, VI0, VI3, VT0 - .align 3 - -.L28: - vfcmp.ceq.s VT0, VM0, x4 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L29 - vfcmp.clt.s VT0, VI4, VI0 - vbitsel.v VI0, VI0, VI4, VT0 - .align 3 - -.L29: - movfr2gr.s i0, $f20 - .align 3 - -.L21: //N<8 - 
andi I, N, 7 - bge $r0, I, .L999 - srai.d i1, N, 3 - slli.d i1, i1, 3 - addi.d i1, i1, 1 //current index - movgr2fr.d $f21, i1 - movgr2fr.d $f20, i0 - .align 3 - -.L22: - fld.s $f9, X, 0 - addi.d I, I, -1 - vfmina.s VM1, x1, VM0 - vfcmp.ceq.s VT0, VM0, VM1 - add.d X, X, INCX - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, VI1, VI0, VT0 - addi.d i1, i1, 1 - movgr2fr.d $f21, i1 - blt $r0, I, .L22 - movfr2gr.s i0, $f20 - .align 3 - -.L999: - move $r4, $r17 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file