diff --git a/common_loongarch64.h b/common_loongarch64.h index 13514d6e0..599b4795c 100644 --- a/common_loongarch64.h +++ b/common_loongarch64.h @@ -139,6 +139,7 @@ static inline int WhereAmI(void){ #define XVFMAX xvfmax.d #define XVFMAXA xvfmaxa.d #define XVCMPEQ xvfcmp.ceq.d +#define XVCMPLE xvfcmp.cle.d #define XVCMPLT xvfcmp.clt.d #define XVMUL xvfmul.d #define XVMSUB xvfmsub.d @@ -151,6 +152,7 @@ static inline int WhereAmI(void){ #define VFMAX vfmax.d #define VFMAXA vfmaxa.d #define VCMPEQ vfcmp.ceq.d +#define VCMPLE vfcmp.cle.d #define VCMPLT vfcmp.clt.d #define VMUL vfmul.d #define VMSUB vfmsub.d @@ -189,6 +191,7 @@ static inline int WhereAmI(void){ #define XVFMAX xvfmax.s #define XVFMAXA xvfmaxa.s #define XVCMPEQ xvfcmp.ceq.s +#define XVCMPLE xvfcmp.cle.s #define XVCMPLT xvfcmp.clt.s #define XVMUL xvfmul.s #define XVMSUB xvfmsub.s @@ -201,6 +204,7 @@ static inline int WhereAmI(void){ #define VFMAX vfmax.s #define VFMAXA vfmaxa.s #define VCMPEQ vfcmp.ceq.s +#define VCMPLE vfcmp.cle.s #define VCMPLT vfcmp.clt.s #define VMUL vfmul.s #define VMSUB vfmsub.s diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index b315c81f2..a8a6dd82f 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -9,6 +9,7 @@ DSCALKERNEL = dscal_lsx.S SAMAXKERNEL = amax_lsx.S DAMAXKERNEL = amax_lsx.S +CAMAXKERNEL = camax_lsx.S SAMINKERNEL = amin_lsx.S DAMINKERNEL = amin_lsx.S @@ -25,8 +26,10 @@ IDMAXKERNEL = imax_lsx.S ISMINKERNEL = imin_lsx.S IDMINKERNEL = imin_lsx.S -ISAMAXKERNEL = isamax_lsx.S -IDAMAXKERNEL = idamax_lsx.S +ISAMAXKERNEL = iamax_lsx.S +IDAMAXKERNEL = iamax_lsx.S +ICAMAXKERNEL = icamax_lsx.S +IZAMAXKERNEL = icamax_lsx.S ISAMINKERNEL = iamin_lsx.S IDAMINKERNEL = iamin_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 577f6316e..e4c45e1fa 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -9,6 +9,7 @@ DSCALKERNEL = dscal_lasx.S SAMAXKERNEL = amax_lasx.S DAMAXKERNEL = amax_lasx.S +CAMAXKERNEL = camax_lasx.S SAMINKERNEL = amin_lasx.S DAMINKERNEL = amin_lasx.S @@ -25,8 +26,10 @@ IDMAXKERNEL = imax_lasx.S ISMINKERNEL = imin_lasx.S IDMINKERNEL = imin_lasx.S -ISAMAXKERNEL = isamax_lasx.S -IDAMAXKERNEL = idamax_lasx.S +ISAMAXKERNEL = iamax_lasx.S +IDAMAXKERNEL = iamax_lasx.S +ICAMAXKERNEL = icamax_lasx.S +IZAMAXKERNEL = icamax_lasx.S ISAMINKERNEL = iamin_lasx.S IDAMINKERNEL = iamin_lasx.S diff --git a/kernel/loongarch64/camax_lasx.S b/kernel/loongarch64/camax_lasx.S new file mode 100644 index 000000000..7013430cb --- /dev/null +++ b/kernel/loongarch64/camax_lasx.S @@ -0,0 +1,194 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define t1 $f14
+#define t2 $f18
+#define t3 $f15
+#define t4 $f17
+#define s1 $f22
+#define s2 $f9
+#define s3 $f10
+#define s4 $f11
+#define TEMP $r16
+#define a0 $f20
+#define a1 $f21
+#define x1 $xr9
+#define x2 $xr10
+#define x3 $xr11
+#define x4 $xr12
+#define VT0 $xr13
+#define VT1 $xr14
+#define res0 $xr18
+#define neg1 $xr19
+#define VX0 $xr20
+#define VX1 $xr21
+#define VM0 $xr22
+#define VM1 $xr23
+
+    PROLOGUE
+    xvxor.v VM0, VM0, VM0
+    xvxor.v res0, res0, res0
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    li.w I, -1
+    slli.d TEMP, TEMP, ZBASE_SHIFT
+    slli.d INCX, INCX, ZBASE_SHIFT
+    xvreplgr2vr.w neg1, I
+    xvffint.s.w neg1, neg1
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bge $r0, I, .L23
+    .align 3
+
+.L10:
+    xvld VX0, X, 0 * SIZE
+    xvld VX1, X, 8 * SIZE
+    addi.d I, I, -1
+    xvpickev.w x1, VX1, VX0
+    xvpickod.w x2, VX1, VX0
+    xvfmul.s x3, neg1, x1
+    xvfmul.s x4, neg1, x2
+    xvfcmp.clt.s VT0, x1, res0
+    xvfcmp.clt.s VT1, x2, res0
+    xvbitsel.v x1, x1, x3, VT0
+    xvbitsel.v x2, x2, x4, VT1
+    addi.d X, X, 16 * SIZE
+    xvfadd.s VM1, x1, x2
+    xvfmax.s VM0, VM0, VM1
+    blt $r0, I, .L10
+    .align 3
+
+.L11:
+    xvpermi.d VM1, VM0, 0x4e    // swap the two 128-bit halves of VM0
+    xvfmax.s VM0, VM0, VM1      // fold the high half in, so all 8 lanes are reduced
+    xvpickve.w x1, VM0, 0
+    xvpickve.w x2, VM0, 1
+    xvpickve.w x3, VM0, 2
+    xvpickve.w x4, VM0, 3
+    xvfmax.s VM1, x1, x2
+    xvfmax.s VM0, x3, x4
+    xvfmax.s VM0, VM0, VM1
+    b .L23
+    .align 3
+
+.L20: // INCX!=1
+    bge $r0, I, .L23
+    fmov.s s2, s1    // s1 is zero here (VM0 was cleared); seed all four accumulators
+    fmov.s s3, s1
+    fmov.s s4, s1
+    .align 3
+
+.L21:
+    fld.s t1, X, 0 * SIZE
+    fld.s t2, X, 1 * SIZE
+    add.d X, X, INCX
+    fld.s t3, X, 0 * SIZE
+    fld.s t4, X, 1 * SIZE
+    add.d X, X, INCX
+    fabs.s t1, t1
+    fabs.s t2, t2
+    fabs.s t3, t3
+    fabs.s t4, t4
+    fadd.s t1, t1, t2
+    fadd.s t3, t3, t4
+    fmax.s t1, t1, t3
+    fmax.s s1, s1, t1    // accumulate across iterations instead of overwriting
+    fld.s t1, X, 0 * SIZE
+    fld.s t2, X, 1 * SIZE
+    add.d X, X, INCX
+    fld.s t3, X, 0 * SIZE
+    fld.s t4, X, 1 * SIZE
+    add.d X, X, INCX
+    fabs.s t1, t1
+    fabs.s t2, t2
+    fabs.s t3, t3
+    fabs.s t4, t4
+    fadd.s t1, t1, t2
+    fadd.s t3, t3, t4
+    fmax.s t1, t1, t3
+    fmax.s s2, s2, t1    // was written to s1 twice; the second group belongs in s2
+    fld.s t1, X, 0 * SIZE
+    fld.s t2, X, 1 * SIZE
+    add.d X, X, INCX
+    fld.s t3, X, 0 * SIZE
+    fld.s t4, X, 1 * SIZE
+    add.d X, X, INCX
+    fabs.s t1, t1
+    fabs.s t2, t2
+    fabs.s t3, t3
+    fabs.s t4, t4
+    addi.d I, I, -1
+    fadd.s t1, t1, t2
+    fadd.s t3, t3, t4
+    fmax.s t1, t1, t3
+    fmax.s s3, s3, t1
+    fld.s t1, X, 0 * SIZE
+    fld.s t2, X, 1 * SIZE
+    add.d X, X, INCX
+    fld.s t3, X, 0 * SIZE
+    fld.s t4, X, 1 * SIZE
+    add.d X, X, INCX
+    fabs.s t1, t1
+    fabs.s t2, t2
+    fabs.s t3, t3
+    fabs.s t4, t4
+    fadd.s t1, t1, t2
+    fadd.s t3, t3, t4
+    fmax.s t1, t1, t3
+    fmax.s s4, s4, t1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+    fmax.s s1, s1, s2
+    fmax.s s3, s3, s4
+    fmax.s s1, s1, s3
+    .align 3
+
+.L23: //N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+
.align 3 + +.L24: + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + addi.d I, I, -1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 + add.d X, X, INCX + fmax.s s1, a0, s1 + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/camax_lsx.S b/kernel/loongarch64/camax_lsx.S new file mode 100644 index 000000000..2e55629de --- /dev/null +++ b/kernel/loongarch64/camax_lsx.S @@ -0,0 +1,206 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define t1 $f14
+#define t2 $f18
+#define t3 $f15
+#define t4 $f17
+#define s1 $f22
+#define s2 $f9
+#define s3 $f10
+#define s4 $f11
+#define TEMP $r16
+#define a0 $f20
+#define a1 $f21
+#define x1 $vr9
+#define x2 $vr10
+#define x3 $vr11
+#define x4 $vr12
+#define VT0 $vr13
+#define VT1 $vr14
+#define res0 $vr18
+#define neg1 $vr19
+#define VX0 $vr20
+#define VX1 $vr21
+#define VM0 $vr22
+#define VM1 $vr23
+
+    PROLOGUE
+    vxor.v VM0, VM0, VM0
+    vxor.v res0, res0, res0
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    li.w I, -1
+    slli.d TEMP, TEMP, ZBASE_SHIFT
+    slli.d INCX, INCX, ZBASE_SHIFT
+    vreplgr2vr.w neg1, I
+    vffint.s.w neg1, neg1
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bge $r0, I, .L23
+    .align 3
+
+.L10:
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    addi.d I, I, -1
+    vpickev.w x1, VX1, VX0
+    vpickod.w x2, VX1, VX0
+    vfmul.s x3, neg1, x1
+    vfmul.s x4, neg1, x2
+    vfcmp.clt.s VT0, x1, res0
+    vfcmp.clt.s VT1, x2, res0
+    vld VX0, X, 8 * SIZE
+    vbitsel.v x1, x1, x3, VT0
+    vbitsel.v x2, x2, x4, VT1
+    vld VX1, X, 12 * SIZE
+    vfadd.s VM1, x1, x2
+    vpickev.w x1, VX1, VX0
+    vpickod.w x2, VX1, VX0
+    vfmul.s x3, neg1, x1
+    vfmul.s x4, neg1, x2
+    vfcmp.clt.s VT0, x1, res0
+    vfcmp.clt.s VT1, x2, res0
+    addi.d X, X, 16 * SIZE
+    vbitsel.v x1, x1, x3, VT0
+    vbitsel.v x2, x2, x4, VT1
+    vfadd.s x1, x1, x2
+    vfmax.s VM1, x1, VM1
+    vfmax.s VM0, VM0, VM1
+    blt $r0, I, .L10
+    .align 3
+
+.L11:
+    vreplvei.w x1, VM0, 0
+    vreplvei.w x2, VM0, 1
+    vreplvei.w x3, VM0, 2
+    vreplvei.w x4, VM0, 3
+    vfmax.s VM1, x1, x2
+    vfmax.s VM0, x3, x4
+    vfmax.s VM0, VM0, VM1
+    b .L23
+    .align 3
+
+.L20: // INCX!=1
+    bge $r0, I, .L23
+    fmov.s s2, s1    // s1 is zero here (VM0 was cleared); seed all four accumulators
+    fmov.s s3, s1
+    fmov.s s4, s1
+    .align 3
+
+.L21:
+    fld.s t1, X, 0 * SIZE
+    fld.s t2, X, 1 * SIZE
+    add.d X, X, INCX
+    fld.s t3, X, 0 * SIZE
+    fld.s t4, X, 1 * SIZE
+    add.d X, X, INCX
+    fabs.s t1, t1
+    fabs.s t2, t2
+    fabs.s t3, t3
+    fabs.s t4, t4
+    fadd.s t1, t1, t2
+    fadd.s t3, t3, t4
+    fmax.s t1, t1, t3
+    fmax.s s1, s1, t1    // accumulate across iterations instead of overwriting
+    fld.s t1, X, 0 * SIZE
+    fld.s t2, X, 1 * SIZE
+    add.d X, X, INCX
+    fld.s t3, X, 0 * SIZE
+    fld.s t4, X, 1 * SIZE
+    add.d X, X, INCX
+    fabs.s t1, t1
+    fabs.s t2, t2
+    fabs.s t3, t3
+    fabs.s t4, t4
+    fadd.s t1, t1, t2
+    fadd.s t3, t3, t4
+    fmax.s t1, t1, t3
+    fmax.s s2, s2, t1    // was written to s1 twice; the second group belongs in s2
+    fld.s t1, X, 0 * SIZE
+    fld.s t2, X, 1 * SIZE
+    add.d X, X, INCX
+    fld.s t3, X, 0 * SIZE
+    fld.s t4, X, 1 * SIZE
+    add.d X, X, INCX
+    fabs.s t1, t1
+    fabs.s t2, t2
+    fabs.s t3, t3
+    fabs.s t4, t4
+    addi.d I, I, -1
+    fadd.s t1, t1, t2
+    fadd.s t3, t3, t4
+    fmax.s t1, t1, t3
+    fmax.s s3, s3, t1
+    fld.s t1, X, 0 * SIZE
+    fld.s t2, X, 1 * SIZE
+    add.d X, X, INCX
+    fld.s t3, X, 0 * SIZE
+    fld.s t4, X, 1 * SIZE
+    add.d X, X, INCX
+    fabs.s t1, t1
+    fabs.s t2, t2
+    fabs.s t3, t3
+    fabs.s t4, t4
+    fadd.s t1, t1, t2
+    fadd.s t3, t3, t4
+    fmax.s t1, t1, t3
+    fmax.s s4, s4, t1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+    fmax.s s1, s1, s2
+    fmax.s s3, s3, s4
+    fmax.s s1, s1, s3
+    .align 3
+
+.L23: //N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    fld.s a0, X, 0 * SIZE
+    fld.s a1, X, 1 * SIZE
+    addi.d I, I, -1
+    fabs.s a0, a0
+    fabs.s a1, a1
+    fadd.s a0, a0, a1
+    add.d X, X, INCX
+    fmax.s s1, a0, s1
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    fmov.s $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/isamax_lasx.S b/kernel/loongarch64/iamax_lasx.S
similarity index 55%
rename from kernel/loongarch64/isamax_lasx.S
rename to kernel/loongarch64/iamax_lasx.S
index
2800b1d43..090da3004 100644 --- a/kernel/loongarch64/isamax_lasx.S +++ b/kernel/loongarch64/iamax_lasx.S @@ -1,3 +1,30 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + #define ASSEMBLER #include "common.h" @@ -39,6 +66,31 @@ slli.d INCX, INCX, BASE_SHIFT bne INCX, TEMP, .L20 xvld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else addi.w i0, i0, 1 srai.d I, N, 3 bge $r0, I, .L21 @@ -76,9 +128,25 @@ xvinsgr2vr.w VI0, i0, 6 //7 addi.w i0, i0, 1 xvinsgr2vr.w VI0, i0, 7 //8 +#endif .align 3 .L10: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvadd.d VI1, VI1, VINC8 + xvld VX1, X, 4 * SIZE + xvadd.d VI2, VI1, VINC4 + xvfmaxa.d VM1, VX0, VX1 + xvfcmp.ceq.d VT0, VX0, VM1 + addi.d I, I, -1 + xvbitsel.v VI2, VI2, VI1, VT0 + xvfmaxa.d VM1, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI2, VI0, VT0 +#else xvld VX0, X, 0 * SIZE addi.d I, I, -1 xvadd.w VI1, VI1, VINC8 @@ -87,10 +155,21 @@ addi.d X, X, 8 * SIZE xvbitsel.v VM0, VM1, VM0, VT0 xvbitsel.v VI0, VI1, VI0, VT0 +#endif blt $r0, I, .L10 .align 3 .L15: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 +#else xvxor.v VX0, VX0, VX0 xvor.v VX0, VI0, VX0 xvxor.v VX1, VX1, VX1 @@ -103,28 
+182,62 @@ xvpickve.w x2, VM0, 1 xvpickve.w x3, VM0, 2 xvpickve.w x4, VM0, 3 - xvfmaxa.s VM1, x1, x2 - xvfcmp.ceq.s VT0, x1, VM1 +#endif + XVFMAXA VM1, x1, x2 + XVCMPEQ VT0, x1, VM1 xvbitsel.v VINC4, VI2, VI1, VT0 - xvfmaxa.s VM0, x3, x4 - xvfcmp.ceq.s VT0, x3, VM0 + XVFMAXA VM0, x3, x4 + XVCMPEQ VT0, x3, VM0 xvbitsel.v VINC8, VI4, VI3, VT0 - xvfmaxa.s VM0, VM0, VM1 - xvfcmp.ceq.s VT0, VM0, VM1 + XVFMAXA VM0, VM0, VM1 + XVCMPEQ VT0, VM0, VM1 xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - xvfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 + CMPEQ $fcc0, $f15, $f9 bceqz $fcc0, .L26 - xvfcmp.clt.s VT0, VI1, VI0 + XVCMPLT VT0, VI1, VI0 xvbitsel.v VI0, VI0, VI1, VT0 b .L26 .align 3 .L20: // INCX!=1 move TEMP, X +#ifdef DOUBLE + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t2, 1 + xvinsgr2vr.d VM0, t3, 2 + xvinsgr2vr.d VM0, t4, 3 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else addi.w i0, i0, 1 ld.w t1, TEMP, 0 * SIZE add.d TEMP, TEMP, INCX @@ -186,9 +299,46 @@ xvinsgr2vr.w VI0, i0, 6 //7 addi.w i0, i0, 1 xvinsgr2vr.w VI0, i0, 7 //8 +#endif .align 3 .L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + xvadd.d VI1, VI1, VINC8 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvadd.d VI2, VI1, VINC4 + xvfmaxa.d VM1, VX0, VX1 + xvfcmp.ceq.d VT0, VX0, VM1 + addi.d I, I, -1 + xvbitsel.v VI2, VI2, VI1, VT0 + xvfmaxa.d VM1, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI2, VI0, VT0 +#else ld.w t1, X, 0 * SIZE add.d X, X, INCX ld.w t2, X, 0 * SIZE @@ -219,10 +369,30 @@ addi.d I, I, -1 xvbitsel.v VM0, VM1, VM0, VT0 xvbitsel.v VI0, VI1, VI0, VT0 +#endif blt $r0, I, .L24 .align 3 .L25: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmaxa.d VM1, x1, x2 + xvfcmp.ceq.d VT0, x1, VM1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmaxa.d VM0, x4, x3 + xvfcmp.ceq.d VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmaxa.d VM0, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#else xvxor.v VX0, VX0, VX0 xvor.v VX0, VI0, VX0 xvxor.v VX1, VX1, VX1 @@ -245,44 +415,45 @@ xvfcmp.ceq.s VT0, VM0, VM1 xvbitsel.v VM0, VM0, VM1, VT0 xvbitsel.v 
VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - xvfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 +#endif + CMPEQ $fcc0, $f15, $f9 bceqz $fcc0, .L26 - xvfcmp.clt.s VT0, VI1, VI0 + XVCMPLT VT0, VI1, VI0 xvbitsel.v VI0, VI0, VI1, VT0 .align 3 .L26: - xvfcmp.ceq.s VT0, VM0, x2 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f10 bceqz $fcc0, .L27 - xvfcmp.clt.s VT0, VI2, VI0 + XVCMPLT VT0, VI2, VI0 xvbitsel.v VI0, VI0, VI2, VT0 .align 3 .L27: - xvfcmp.ceq.s VT0, VM0, x3 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f11 bceqz $fcc0, .L28 - xvfcmp.clt.s VT0, VI3, VI0 + XVCMPLT VT0, VI3, VI0 xvbitsel.v VI0, VI0, VI3, VT0 .align 3 .L28: - xvfcmp.ceq.s VT0, VM0, x4 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f12 bceqz $fcc0, .L29 - xvfcmp.clt.s VT0, VI4, VI0 + XVCMPLT VT0, VI4, VI0 xvbitsel.v VI0, VI0, VI4, VT0 .align 3 .L29: +#ifdef DOUBLE + movfr2gr.d i0, $f20 +#else fmov.s $f16, $f20 +#endif .align 3 +#ifdef DOUBLE + +#else .L252: xvxor.v VI0, VI0, VI0 xvor.v VI0, VI0, VX0 @@ -306,35 +477,27 @@ xvfmaxa.s VM0, VM0, VM1 xvfcmp.ceq.s VT0, VM0, VM1 xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - xvfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f9 bceqz $fcc0, .L262 xvfcmp.clt.s VT0, VI1, VI0 xvbitsel.v VI0, VI0, VI1, VT0 .align 3 .L262: - xvfcmp.ceq.s VT0, VM0, x2 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f10 bceqz $fcc0, .L272 xvfcmp.clt.s VT0, VI2, VI0 xvbitsel.v VI0, VI0, VI2, VT0 .align 3 - .L272: - xvfcmp.ceq.s VT0, VM0, x3 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f11 bceqz $fcc0, .L282 xvfcmp.clt.s VT0, VI3, VI0 xvbitsel.v VI0, VI0, VI3, VT0 .align 3 .L282: - xvfcmp.ceq.s VT0, VM0, x4 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f12 bceqz $fcc0, .L292 xvfcmp.clt.s VT0, VI4, VI0 xvbitsel.v VI0, VI0, VI4, VT0 @@ -345,8 +508,9 @@ xvfcmp.ceq.s VT0, VM0, VX0 xvbitsel.v VI0, VI0, VI1, VT0 movfr2gr.s i0, $f20 +#endif -.L21: // N<8 +.L21: //N<8 andi I, N, 7 bge $r0, I, .L999 srai.d i1, N, 3 @@ -357,17 +521,17 @@ .align 3 .L22: - fld.s $f9, X, 0 + LD $f9, X, 0 addi.d I, I, -1 - xvfmaxa.s VM1, x1, VM0 - xvfcmp.ceq.s VT0, VM0, VM1 + XVFMAXA VM1, x1, VM0 + XVCMPEQ VT0, VM0, VM1 add.d X, X, INCX xvbitsel.v VM0, VM1, VM0, VT0 xvbitsel.v VI0, VI1, VI0, VT0 addi.d i1, i1, 1 movgr2fr.d $f21, i1 blt $r0, I, .L22 - movfr2gr.s i0, $f20 + MTG i0, $f20 .align 3 .L999: @@ -375,4 +539,4 @@ jirl $r0, $r1, 0x0 .align 3 - EPILOGUE \ No newline at end of file + EPILOGUE diff --git a/kernel/loongarch64/iamax_lsx.S b/kernel/loongarch64/iamax_lsx.S new file mode 100644 index 000000000..ce5b3c724 --- /dev/null +++ b/kernel/loongarch64/iamax_lsx.S @@ -0,0 +1,482 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VINC2 $vr17 +#define VINC4 $vr18 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + vld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L11 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC2, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC2, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w VINC4, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L10: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 2 * SIZE + vadd.d VI2, VI1, VINC2 + vfmaxa.d x1, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x1 + vbitsel.v x2, VI2, VI1, VT0 + vld VX0, X, 4 * SIZE + vadd.d VI1, VI2, VINC2 + vld VX1, X, 6 * SIZE + vadd.d VI2, VI1, VINC2 + vfmaxa.d x3, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x3 + vbitsel.v x4, VI2, VI1, VT0 + vfmaxa.d x3, x1, x3 + vfcmp.ceq.d VT0, x1, x3 + vbitsel.v x2, x4, x2, VT0 + vfmaxa.d VM1, VM0, x3 + vfcmp.ceq.d VT0, VM0, VM1 + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, x2, VI0, VT0 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE +#else + vld VX0, X, 0 * SIZE + vadd.w VI1, VI1, VINC4 + vld VX1, X, 4 * SIZE + vadd.w VI2, VI1, VINC2 + vfmaxa.s VM1, VX0, VX1 + vfcmp.ceq.s VT0, VX0, VM1 + addi.d I, I, -1 + vbitsel.v VI2, VI2, 
VI1, VT0 + vfmaxa.s VM1, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI2, VI0, VT0 +#endif + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L16 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L17 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmaxa.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC2, VI2, VI1, VT0 + vfmaxa.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC4, VI4, VI3, VT0 + vfmaxa.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC4, VINC2, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L26 +#endif + .align 3 + +#ifdef DOUBLE +.L16: + vfmaxa.d VM0, x1, x2 + vfcmp.ceq.d VT0, x1, VM0 + vbitsel.v VI0, VI2, VI1, VT0 + .align 3 + +.L17: + movfr2gr.d i0, $f20 + .align 3 + +.L11: //INCX==1 and N<8 + andi I, N, 7 + bge $r0, I, .L14 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L13: + fld.d $f9, X, 0 + vfmaxa.d VM1, x1, VM0 + vfcmp.ceq.d VT0, VM0, VM1 + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + addi.d I, I, -1 + addi.d i1, i1, 1 + addi.d X, X, SIZE + movgr2fr.d $f21, i1 + blt $r0, I, .L13 + movfr2gr.d i0, $f20 + .align 3 + +.L14: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + move TEMP, X + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t2, 1 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC2, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 + .align 3 + +.L24: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI1, VINC4 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, VINC2 + vfmaxa.d x1, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x1 + vbitsel.v x2, VI2, VI1, VT0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI2, VINC2 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, VINC2 + vfmaxa.d x3, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x3 + vbitsel.v x4, VI2, VI1, VT0 + vfmaxa.d x3, x1, x3 + vfcmp.ceq.d VT0, x1, x3 + vbitsel.v x2, x4, x2, VT0 + vfmaxa.d VM1, VM0, x3 + vbitsel.v VM0, VM1, VM0, VT0 + vfcmp.ceq.d VT0, VM0, VM1 + vbitsel.v VI0, x2, VI0, VT0 + addi.d I, I, -1 + blt $r0, I, .L24 + .align 3 + +.L25: + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 + .align 
3 + +.L26: + vfmaxa.d VM0, x1, x2 + vfcmp.ceq.d VT0, x1, VM0 + vbitsel.v VI0, VI2, VI1, VT0 + .align 3 + +.L27: + movfr2gr.d i0, $f20 + .align 3 + +#else +.L20: // INCX!=1 + move TEMP, X + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t2, 1 + vinsgr2vr.w VM0, t3, 2 + vinsgr2vr.w VM0, t4, 3 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC2, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w VINC4, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 + .align 3 + +.L24: + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vadd.w VI1, VI1, VINC4 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vadd.w VI2, VI1, VINC2 + vfmaxa.s VM1, VX0, VX1 + vfcmp.ceq.s VT0, VX0, VM1 + vbitsel.v VI2, VI2, VI1, VT0 + vfmaxa.s VM1, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + addi.d I, I, -1 + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI2, VI0, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmaxa.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC2, VI2, VI1, VT0 + vfmaxa.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC4, VI4, VI3, VT0 + vfmaxa.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC4, VINC2, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L26: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + vfcmp.clt.s VT0, VI2, VI0 + vbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + vfcmp.clt.s VT0, VI3, VI0 + vbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + vfcmp.clt.s VT0, VI4, VI0 + vbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: + movfr2gr.s i0, $f20 + .align 3 + +#endif +.L21: // N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + LD $f9, X, 0 + VFMAXA VM1, x1, VM0 + VCMPEQ VT0, VM0, VM1 + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + addi.d I, I, -1 + addi.d i1, i1, 1 + add.d X, X, INCX + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/icamax_lasx.S b/kernel/loongarch64/icamax_lasx.S new file mode 100644 index 
000000000..7800cb917 --- /dev/null +++ b/kernel/loongarch64/icamax_lasx.S @@ -0,0 +1,562 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define a0 $f12 +#define a1 $f13 +#define s1 $f15 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VX0 $xr13 +#define VX1 $xr14 +#define VM0 $xr15 +#define VM1 $xr16 +#define VINC4 $xr17 +#define VINC8 $xr18 +#define VI0 $xr20 +#define VI1 $xr21 +#define VI2 $xr22 +#define VI3 $xr8 +#define VI4 $xr19 +#define VT0 $xr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + xvxor.v VM0, VM0, VM0 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + xvxor.v VI3, VI3, VI3 // 0 +#ifdef DOUBLE + li.d I, -1 + xvreplgr2vr.d VI4, I + xvffint.d.l VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, -1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 1 //3 + addi.d i0, i0, -1 + xvinsgr2vr.d VI0, i0, 2 //2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + li.w I, -1 + xvreplgr2vr.w VI4, I + xvffint.s.w VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, 
i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, -3 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 2 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //6 + addi.w i0, i0, -3 + xvinsgr2vr.w VI0, i0, 4 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //4 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvadd.d VI1, VI1, VINC4 + xvld VX1, X, 4 * SIZE + addi.d I, I, -1 + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmul.d x3, VI4, x1 + xvfmul.d x4, VI4, x2 + xvfcmp.clt.d VT0, x1, VI3 + xvfcmp.clt.d VINC8, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC8 +#else + xvadd.w VI1, VI1, VINC8 + xvld VX1, X, 8 * SIZE + addi.d I, I, -1 + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s x3, VI4, x1 + xvfmul.s x4, VI4, x2 + xvfcmp.clt.s VT0, x1, VI3 + xvfcmp.clt.s VINC4, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC4 +#endif + XVFADD x1, x1, x2 + XVFMAX x3, VM0, x1 + XVCMPEQ VT0, x3, VM0 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, x3, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmax.d VM1, x1, x2 + xvfcmp.ceq.d VT0, VM1, x1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmax.d VM0, x3, x4 + xvfcmp.ceq.d VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmax.d VM0, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#else + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#endif + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + XVCMPLT VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + +.L20: // INCX!=1 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, -1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 1 //3 + addi.d i0, i0, -1 + xvinsgr2vr.d VI0, i0, 2 //2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + 
addi.w i0, i0, -3
+    xvinsgr2vr.w VI1, i0, 4
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 5
+    addi.w i0, i0, 3
+    xvinsgr2vr.w VI1, i0, 6
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 7
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 3
+    xvinsgr2vr.w VI0, i0, 2 //5
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 3 //6
+    addi.w i0, i0, -3
+    xvinsgr2vr.w VI0, i0, 4 //3
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 5 //4
+    addi.w i0, i0, 3
+    xvinsgr2vr.w VI0, i0, 6 //7
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 7 //8
+#endif
+    .align 3
+
+.L24:
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 0
+    xvinsgr2vr.d x2, t2, 0
+    xvinsgr2vr.d x1, t3, 1
+    xvinsgr2vr.d x2, t4, 1
+    xvadd.d VI1, VI1, VINC4
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 2
+    xvinsgr2vr.d x2, t2, 2
+    xvinsgr2vr.d x1, t3, 3
+    xvinsgr2vr.d x2, t4, 3
+    addi.d I, I, -1
+    xvfmul.d x3, VI4, x1
+    xvfmul.d x4, VI4, x2
+    xvfcmp.clt.d VT0, x1, VI3
+    xvfcmp.clt.d VINC8, x2, VI3
+    xvbitsel.v x1, x1, x3, VT0
+    xvbitsel.v x2, x2, x4, VINC8
+    xvfadd.d x1, x1, x2
+    xvfmax.d x3, VM0, x1
+    xvfcmp.ceq.d VT0, x3, VM0
+#else
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 0
+    xvinsgr2vr.w x2, t2, 0
+    xvinsgr2vr.w x1, t3, 1
+    xvinsgr2vr.w x2, t4, 1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 2
+    xvinsgr2vr.w x2, t2, 2
+    xvinsgr2vr.w x1, t3, 3
+    xvinsgr2vr.w x2, t4, 3
+    xvadd.w VI1, VI1, VINC8    // advance the index vector once per 8 elements (was done twice)
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 4
+    xvinsgr2vr.w x2, t2, 4
+    xvinsgr2vr.w x1, t3, 5
+    xvinsgr2vr.w x2, t4, 5
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 6
+    xvinsgr2vr.w x2, t2, 6
+    xvinsgr2vr.w x1, t3, 7
+    xvinsgr2vr.w x2, t4, 7
+    addi.d I, I, -1
+    xvfmul.s x3, VI4, x1    // x1/x2 already hold the deinterleaved re/im parts; the
+    xvfmul.s x4, VI4, x2    // stray xvpickev/xvpickod of the unloaded VX0/VX1 clobbered them
+    xvfcmp.clt.s VT0, x1, VI3
+    xvfcmp.clt.s VINC4, x2, VI3    // scratch in VINC4; VINC8 must keep the index increment
+    xvbitsel.v x1, x1, x3, VT0
+    xvbitsel.v x2, x2, x4, VINC4
+    xvfadd.s x1, x1, x2
+    xvfmax.s x3, VM0, x1
+    xvfcmp.ceq.s VT0, x3, VM0
+#endif
+    xvbitsel.v VM0, x3, VM0, VT0
+    xvbitsel.v VI0, VI1, VI0, VT0
+    blt $r0, I, .L24
+    .align 3
+
+.L25:
+#ifdef DOUBLE
+    xvpickve.d VI1, VI0, 0
+    xvpickve.d VI2, VI0, 1
+    xvpickve.d VI3, VI0, 2
+    xvpickve.d VI4, VI0, 3
+    xvpickve.d x1, VM0, 0
+    xvpickve.d x2, VM0, 1
+    xvpickve.d x3, VM0, 2
+    xvpickve.d x4, VM0, 3
+    xvfmaxa.d VM1, x1, x2
+    xvfcmp.ceq.d VT0, VM1, x1
+    xvbitsel.v VINC4, VI2, VI1, VT0
+    xvfmaxa.d VM0, x3, x4
+    xvfcmp.ceq.d VT0, x3, VM0
+    xvbitsel.v VINC8, VI4, VI3, VT0
+    xvfmaxa.d VM0, VM0, VM1
+    xvfcmp.ceq.d VT0, VM0, VM1
+    xvbitsel.v VI0, VINC8, VINC4, VT0
+#else
+    xvxor.v VX0, VX0, VX0
+    xvor.v VX0, VI0, VX0
+    xvxor.v VX1, VX1, VX1
+    xvor.v VX1, VM0, VX1
+    xvpickve.w VI1, VI0, 0
+    xvpickve.w VI2, VI0, 1
+    xvpickve.w VI3, VI0, 2
+    xvpickve.w VI4, VI0, 3
+    xvpickve.w x1, VM0, 0
+    xvpickve.w x2, VM0, 1
+    xvpickve.w x3, VM0, 2
+    xvpickve.w x4, VM0, 3
+    xvfcmp.clt.s VT0, x1, x2
+
xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#endif + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + XVCMPLT VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L26: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + XVCMPLT VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + XVCMPLT VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + XVCMPLT VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: +#ifdef DOUBLE + movfr2gr.d i0, $f20 +#else + fmov.s $f16, $f20 +#endif + .align 3 + +#ifdef DOUBLE +#else +.L252: + xvxor.v VI0, VI0, VI0 + xvor.v VI0, VI0, VX0 + fmov.s $f13, $f15 + xvxor.v VM0, VM0, VM0 + xvor.v VM0, VM0, VX1 + xvpickve.w VI1, VI0, 4 + xvpickve.w VI2, VI0, 5 + xvpickve.w VI3, VI0, 6 + xvpickve.w VI4, VI0, 7 + xvpickve.w x1, VM0, 4 + xvpickve.w x2, VM0, 5 + xvpickve.w x3, VM0, 6 + xvpickve.w x4, VM0, 7 + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v x1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM0, x1 + xvbitsel.v VM0, VM0, x1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L262 + xvfcmp.clt.s VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L262: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L272 + xvfcmp.clt.s VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L272: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L282 + xvfcmp.clt.s VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L282: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L292 + xvfcmp.clt.s VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L292: + fcmp.clt.s $fcc0, $f15, $f13 + fsel $f15, $f15, $f13, $fcc0 + fsel $f20, $f20, $f16, $fcc0 + movfr2gr.s i0, $f20 + +#endif +.L21: //N<8 +#ifdef DOUBLE + andi I, N, 3 + bge $r0, I, .L999 + srai.d i1, N, 2 + slli.d i1, i1, 2 +#else + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 +#endif + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + addi.d I, I, -1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 + FMAX a1, s1, a0 + CMPEQ $fcc0, s1, a1 + add.d X, X, INCX + fsel s1, a1, s1, $fcc0 + fsel $f20, $f21, $f20, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/icamax_lsx.S b/kernel/loongarch64/icamax_lsx.S new file mode 100644 index 000000000..a2fc9dbbd --- /dev/null +++ b/kernel/loongarch64/icamax_lsx.S @@ -0,0 +1,434 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define a0 $f12 +#define a1 $f13 +#define s1 $f15 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VINC4 $vr17 +#define VINC8 $vr18 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + vxor.v VM0, VM0, VM0 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + vxor.v VI3, VI3, VI3 // 0 +#ifdef DOUBLE + li.d I, -1 + vreplgr2vr.d VI4, I + vffint.d.l VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -3 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + li.w I, -1 + vreplgr2vr.w VI4, I + vffint.s.w VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.w i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + addi.w i0, i0, -7 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L10: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 2 * SIZE + addi.d I, I, -1 + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VI4, x1 + vfmul.d x4, VI4, x2 + vfcmp.clt.d VT0, x1, VI3 + vfcmp.clt.d VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + vfadd.d x1, x1, x2 + vfmax.d x3, VM0, x1 + vfcmp.ceq.d VT0, x3, VM0 + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + vld 
VX0, X, 4 * SIZE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VI4, x1 + vfmul.d x4, VI4, x2 +#else + vadd.w VI1, VI1, VINC4 + vld VX1, X, 4 * SIZE + addi.d I, I, -1 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, VI4, x1 + vfmul.s x4, VI4, x2 +#endif + VCMPLT VT0, x1, VI3 + VCMPLT VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + VFADD x1, x1, x2 + VFMAX x3, VM0, x1 + VCMPEQ VT0, x3, VM0 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmaxa.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC4, VI2, VI1, VT0 + vfmaxa.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC8, VI4, VI3, VT0 + vfmaxa.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L26 +#endif + .align 3 + +.L20: // INCX!=1 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -3 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + addi.w i0, i0, -7 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vadd.d VI1, VI1, VINC4 + vfmul.d x3, VI4, x1 + vfmul.d x4, VI4, x2 + vfcmp.clt.d VT0, x1, VI3 + vfcmp.clt.d VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + vfadd.d x1, x1, x2 + vfmax.d x3, VM0, x1 + ld.d t1, X, 0 * SIZE + vfcmp.ceq.d VT0, x3, VM0 + ld.d t2, X, 1 * SIZE + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vadd.d VI1, VI1, VINC4 + addi.d I, I, -1 + vfmul.d x3, VI4, x1 + vfmul.d x4, VI4, x2 + vfcmp.clt.d VT0, x1, VI3 + vfcmp.clt.d VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + vfadd.d x1, x1, x2 + vfmax.d x3, VM0, x1 + vfcmp.ceq.d VT0, x3, VM0 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, 
X, INCX
+    vinsgr2vr.w x1, t1, 0
+    vinsgr2vr.w x2, t2, 0
+    vinsgr2vr.w x1, t3, 1
+    vinsgr2vr.w x2, t4, 1
+    vadd.w VI1, VI1, VINC4
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 2
+    vinsgr2vr.w x2, t2, 2
+    vinsgr2vr.w x1, t3, 3
+    vinsgr2vr.w x2, t4, 3
+    addi.d I, I, -1
+    vfmul.s x3, VI4, x1    // x1/x2 already hold the deinterleaved re/im parts; the
+    vfmul.s x4, VI4, x2    // stray vpickev/vpickod of the unloaded VX0/VX1 clobbered them
+    vfcmp.clt.s VT0, x1, VI3
+    vfcmp.clt.s VINC8, x2, VI3
+    vbitsel.v x1, x1, x3, VT0
+    vbitsel.v x2, x2, x4, VINC8
+    vfadd.s x1, x1, x2
+    vfmax.s x3, VM0, x1
+    vfcmp.ceq.s VT0, x3, VM0
+#endif
+    vbitsel.v VM0, x3, VM0, VT0
+    vbitsel.v VI0, VI1, VI0, VT0
+    blt $r0, I, .L24
+    .align 3
+
+.L25:
+#ifdef DOUBLE
+    vreplvei.d VI1, VI0, 0
+    vreplvei.d VI2, VI0, 1
+    vreplvei.d x1, VM0, 0
+    vreplvei.d x2, VM0, 1
+    fcmp.ceq.d $fcc0, $f10, $f9
+    bceqz $fcc0, .L26
+    vfcmp.clt.d VT0, VI1, VI2
+    vbitsel.v VI0, VI2, VI1, VT0
+    b .L27
+#else
+    vreplvei.w VI1, VI0, 0
+    vreplvei.w VI2, VI0, 1
+    vreplvei.w VI3, VI0, 2
+    vreplvei.w VI4, VI0, 3
+    vreplvei.w x1, VM0, 0
+    vreplvei.w x2, VM0, 1
+    vreplvei.w x3, VM0, 2
+    vreplvei.w x4, VM0, 3
+    vfmaxa.s VM1, x1, x2
+    vfcmp.ceq.s VT0, VM1, x1
+    vbitsel.v VINC4, VI2, VI1, VT0
+    vfmaxa.s VM0, x3, x4
+    vfcmp.ceq.s VT0, x3, VM0
+    vbitsel.v VINC8, VI4, VI3, VT0
+    vfmaxa.s VM0, VM0, VM1
+    vfcmp.ceq.s VT0, VM0, VM1
+    vbitsel.v VI0, VINC8, VINC4, VT0
+    fcmp.ceq.d $fcc0, $f15, $f9
+    bceqz $fcc0, .L26
+    vfcmp.clt.s VT0, VI1, VI0
+    vbitsel.v VI0, VI0, VI1, VT0
+#endif
+    .align 3
+
+#ifdef DOUBLE
+.L26:
+    vfmaxa.d VM0, x1, x2
+    vfcmp.ceq.d VT0, x1, VM0
+    vbitsel.v VI0, VI2, VI1, VT0
+    .align 3
+
+.L27:
+    movfr2gr.d i0, $f20
+    .align 3
+#else
+.L26:
+    fcmp.ceq.d $fcc0, $f15, $f10
+    bceqz $fcc0, .L27
+    vfcmp.clt.s VT0, VI2, VI0
+    vbitsel.v VI0, VI0, VI2, VT0
+    .align 3
+
+.L27:
+    fcmp.ceq.d $fcc0, $f15, $f11
+    bceqz $fcc0, .L28
+    vfcmp.clt.s VT0, VI3, VI0
+    vbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L28:
+    fcmp.ceq.d $fcc0, $f15, $f12
+    bceqz $fcc0, .L29
+    vfcmp.clt.s VT0, VI4, VI0
+    vbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L29:
+    movfr2gr.s i0, $f20
+    .align 3
+
+#endif
+.L21: //N<4
+    andi I, N, 3
+    bge $r0, I, .L999
+    srai.d i1, N, 2
+    slli.d i1, i1, 2
+    addi.d i1, i1, 1 //current index
+    movgr2fr.d $f21, i1
+    movgr2fr.d $f20, i0
+    .align 3
+
+.L22:
+    LD a0, X, 0 * SIZE
+    LD a1, X, 1 * SIZE
+    addi.d I, I, -1
+    FABS a0, a0
+    FABS a1, a1
+    ADD a0, a0, a1
+    FMAX a1, s1, a0
+    CMPEQ $fcc0, s1, a1
+    add.d X, X, INCX
+    fsel s1, a1, s1, $fcc0
+    fsel $f20, $f21, $f20, $fcc0
+    addi.d i1, i1, 1
+    movgr2fr.d $f21, i1
+    blt $r0, I, .L22
+    MTG i0, $f20
+    .align 3
+
+.L999:
+    move $r4, $r17
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/idamax_lasx.S b/kernel/loongarch64/idamax_lasx.S
deleted file mode 100644
index 8248ee757..000000000
--- a/kernel/loongarch64/idamax_lasx.S
+++ /dev/null
@@ -1,275 +0,0 @@
-#define ASSEMBLER
-
-#include "common.h"
-
-#define N $r4
-#define X $r5
-#define INCX $r6
-#define I $r12
-#define t1 $r13
-#define t2 $r15
-#define t3 $r18
-#define t4 $r16
-#define i0 $r17
-#define i1 $r14
-#define TEMP $r19
-#define x1 $xr9
-#define x2 $xr10
-#define x3 $xr11
-#define x4 $xr12
-#define VX0 $xr13
-#define VX1 $xr14
-#define VM0 $xr15
-#define VM1 $xr16
-#define VINC4 $xr17
-#define VINC8 $xr18
-#define VI0 $xr20
-#define VI1 $xr21
-#define VI2 $xr22
-#define VI3 $xr8
-#define VI4 $xr19
-#define VT0 $xr23
-
-    PROLOGUE
-    li.d i0, 0
-    bge $r0, N, .L999
-    bge $r0, INCX, .L999
-    li.d
TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - xvld VM0, X, 0 - addi.d i0, i0, 1 - srai.d I, N, 3 - bge $r0, I, .L21 - slli.d i0, i0, 2 //4 - xvreplgr2vr.d VINC4, i0 - slli.d i0, i0, 1 //8 - xvreplgr2vr.d VINC8, i0 - addi.d i0, i0, -15 - xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 3 - addi.d i0, i0, 5 - xvinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 1 //2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 2 //3 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 3 //4 - .align 3 - -.L10: - xvld VX0, X, 0 * SIZE - xvadd.d VI1, VI1, VINC8 - xvld VX1, X, 4 * SIZE - xvadd.d VI2, VI1, VINC4 - xvfmaxa.d VM1, VX0, VX1 - xvfcmp.ceq.d VT0, VX0, VM1 - addi.d I, I, -1 - xvbitsel.v VI2, VI2, VI1, VT0 - xvfmaxa.d VM1, VM0, VM1 - xvfcmp.ceq.d VT0, VM0, VM1 - addi.d X, X, 8 * SIZE - xvbitsel.v VM0, VM1, VM0, VT0 - xvbitsel.v VI0, VI2, VI0, VT0 - blt $r0, I, .L10 - .align 3 - -.L15: - xvpickve.d VI1, VI0, 0 - xvpickve.d VI2, VI0, 1 - xvpickve.d VI3, VI0, 2 - xvpickve.d VI4, VI0, 3 - xvpickve.d x1, VM0, 0 - xvpickve.d x2, VM0, 1 - xvpickve.d x3, VM0, 2 - xvpickve.d x4, VM0, 3 - xvfmaxa.d VM1, x1, x2 - xvfcmp.ceq.d VT0, x1, VM1 - xvbitsel.v VINC4, VI2, VI1, VT0 - xvfmaxa.d VM0, x4, x3 - xvfcmp.ceq.d VT0, x3, VM0 - xvbitsel.v VINC8, VI4, VI3, VT0 - xvfmaxa.d VM0, VM0, VM1 - xvfcmp.ceq.d VT0, VM0, VM1 - xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - xvfcmp.ceq.d VT0, VM0, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - xvfcmp.clt.d VT0, VI1, VI0 - xvbitsel.v VI0, VI0, VI1, VT0 - b .L26 - .align 3 - -.L20: // INCX!=1 - move TEMP, X - addi.d i0, i0, 1 - ld.d t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - xvinsgr2vr.d VM0, t1, 0 - srai.d I, N, 3 - bge $r0, I, .L21 - ld.d t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.d t3, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.d t4, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - xvinsgr2vr.d VM0, t2, 1 - xvinsgr2vr.d VM0, t3, 2 - xvinsgr2vr.d VM0, t4, 3 - slli.d i0, i0, 2 //4 - xvreplgr2vr.d VINC4, i0 - slli.d i0, i0, 1 //8 - xvreplgr2vr.d VINC8, i0 - addi.d i0, i0, -15 - xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 3 - addi.d i0, i0, 5 - xvinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 1 //2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 2 //3 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 3 //4 - .align 3 - -.L24: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 - xvinsgr2vr.d VX0, t3, 2 - xvinsgr2vr.d VX0, t4, 3 - xvadd.d VI1, VI1, VINC8 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX1, t1, 0 - xvinsgr2vr.d VX1, t2, 1 - xvinsgr2vr.d VX1, t3, 2 - xvinsgr2vr.d VX1, t4, 3 - xvadd.d VI2, VI1, VINC4 - xvfmaxa.d VM1, VX0, VX1 - xvfcmp.ceq.d VT0, VX0, VM1 - addi.d I, I, -1 - xvbitsel.v VI2, VI2, VI1, VT0 - xvfmaxa.d VM1, VM0, VM1 - xvfcmp.ceq.d VT0, VM0, VM1 - xvbitsel.v VM0, VM1, VM0, VT0 - xvbitsel.v VI0, VI2, VI0, VT0 
- blt $r0, I, .L24 - .align 3 - -.L25: - xvpickve.d VI1, VI0, 0 - xvpickve.d VI2, VI0, 1 - xvpickve.d VI3, VI0, 2 - xvpickve.d VI4, VI0, 3 - xvpickve.d x1, VM0, 0 - xvpickve.d x2, VM0, 1 - xvpickve.d x3, VM0, 2 - xvpickve.d x4, VM0, 3 - xvfmaxa.d VM1, x1, x2 - xvfcmp.ceq.d VT0, x1, VM1 - xvbitsel.v VINC4, VI2, VI1, VT0 - xvfmaxa.d VM0, x4, x3 - xvfcmp.ceq.d VT0, x3, VM0 - xvbitsel.v VINC8, VI4, VI3, VT0 - xvfmaxa.d VM0, VM0, VM1 - xvfcmp.ceq.d VT0, VM0, VM1 - xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - xvfcmp.ceq.d VT0, VM0, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - xvfcmp.clt.d VT0, VI1, VI0 - xvbitsel.v VI0, VI0, VI1, VT0 - .align 3 - -.L26: - xvfcmp.ceq.d VT0, VM0, x2 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L27 - xvfcmp.clt.d VT0, VI2, VI0 - xvbitsel.v VI0, VI0, VI2, VT0 - .align 3 - -.L27: - xvfcmp.ceq.d VT0, VM0, x3 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L28 - xvfcmp.clt.d VT0, VI3, VI0 - xvbitsel.v VI0, VI0, VI3, VT0 - .align 3 - -.L28: - xvfcmp.ceq.d VT0, VM0, x4 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L29 - xvfcmp.clt.d VT0, VI4, VI0 - xvbitsel.v VI0, VI0, VI4, VT0 - .align 3 - -.L29: - movfr2gr.d i0, $f20 - .align 3 - -.L21: //N<8 - andi I, N, 7 - bge $r0, I, .L999 - srai.d i1, N, 3 - slli.d i1, i1, 3 - addi.d i1, i1, 1 //current index - movgr2fr.d $f21, i1 - movgr2fr.d $f20, i0 - .align 3 - -.L22: - fld.d $f9, X, 0 - addi.d I, I, -1 - xvfmaxa.d VM1, x1, VM0 - xvfcmp.ceq.d VT0, VM0, VM1 - add.d X, X, INCX - xvbitsel.v VM0, VM1, VM0, VT0 - xvbitsel.v VI0, VI1, VI0, VT0 - addi.d i1, i1, 1 - movgr2fr.d $f21, i1 - blt $r0, I, .L22 - movfr2gr.d i0, $f20 - .align 3 - -.L999: - move $r4, $r17 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/idamax_lsx.S b/kernel/loongarch64/idamax_lsx.S deleted file mode 100644 index fb2d5bac1..000000000 --- a/kernel/loongarch64/idamax_lsx.S +++ /dev/null @@ -1,267 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define t1 $r13 -#define t2 $r15 -#define t3 $r18 -#define t4 $r16 -#define i0 $r17 -#define i1 $r14 -#define TEMP $r19 -#define x1 $vr9 -#define x2 $vr10 -#define x3 $vr11 -#define x4 $vr12 -#define VX0 $vr13 -#define VX1 $vr14 -#define VM0 $vr15 -#define VM1 $vr16 -#define VINC2 $vr17 -#define VINC4 $vr18 -#define VI0 $vr20 -#define VI1 $vr21 -#define VI2 $vr22 -#define VI3 $vr8 -#define VI4 $vr19 -#define VT0 $vr23 - - PROLOGUE - li.d i0, 0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - vld VM0, X, 0 - addi.d i0, i0, 1 - srai.d I, N, 3 - bge $r0, I, .L11 - slli.d i0, i0, 1 //2 - vreplgr2vr.d VINC2, i0 - slli.d i0, i0, 1 //4 - vreplgr2vr.d VINC4, i0 - addi.d i0, i0, -7 - vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - vinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 3 - vinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - vinsgr2vr.d VI0, i0, 1 //2 - .align 3 - -.L10: - vld VX0, X, 0 * SIZE - vadd.d VI1, VI1, VINC4 - vld VX1, X, 2 * SIZE - vadd.d VI2, VI1, VINC2 - vfmaxa.d x1, VX0, VX1 - vfcmp.ceq.d VT0, VX0, x1 - vbitsel.v x2, VI2, VI1, VT0 - vld VX0, X, 4 * SIZE - vadd.d VI1, VI2, VINC2 - vld VX1, X, 6 * SIZE - vadd.d VI2, VI1, VINC2 - vfmaxa.d x3, VX0, VX1 - vfcmp.ceq.d VT0, VX0, x3 - vbitsel.v x4, VI2, VI1, VT0 - vfmaxa.d x3, x1, x3 - vfcmp.ceq.d VT0, x1, x3 - vbitsel.v x2, x4, x2, VT0 - vfmaxa.d VM1, VM0, x3 - 
vfcmp.ceq.d VT0, VM0, VM1 - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, x2, VI0, VT0 - addi.d I, I, -1 - addi.d X, X, 8 * SIZE - blt $r0, I, .L10 - .align 3 - -.L15: - vreplvei.d VI1, VI0, 0 - vreplvei.d VI2, VI0, 1 - vreplvei.d x1, VM0, 0 - vreplvei.d x2, VM0, 1 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - vfcmp.ceq.d VT0, x2, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L16 - vfcmp.clt.d VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - b .L17 - .align 3 - -.L16: - vfmaxa.d VM0, x1, x2 - vfcmp.ceq.d VT0, x1, VM0 - vbitsel.v VI0, VI2, VI1, VT0 - .align 3 - -.L17: - movfr2gr.d i0, $f20 - .align 3 - -.L11: //INCX==1 and N<8 - andi I, N, 7 - bge $r0, I, .L14 - srai.d i1, N, 3 - slli.d i1, i1, 3 - addi.d i1, i1, 1 //current index - movgr2fr.d $f21, i1 - movgr2fr.d $f20, i0 - .align 3 - -.L13: - fld.d $f9, X, 0 - vfmaxa.d VM1, x1, VM0 - vfcmp.ceq.d VT0, VM0, VM1 - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, VI1, VI0, VT0 - addi.d I, I, -1 - addi.d i1, i1, 1 - addi.d X, X, SIZE - movgr2fr.d $f21, i1 - blt $r0, I, .L13 - movfr2gr.d i0, $f20 - .align 3 - -.L14: - move $r4, $r17 - jirl $r0, $r1, 0x0 - .align 3 - -.L20: // INCX!=1 - move TEMP, X - addi.d i0, i0, 1 - ld.d t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.d VM0, t1, 0 - srai.d I, N, 3 - bge $r0, I, .L21 - ld.d t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.d VM0, t2, 1 - slli.d i0, i0, 1 //2 - vreplgr2vr.d VINC2, i0 - slli.d i0, i0, 1 //4 - vreplgr2vr.d VINC4, i0 - addi.d i0, i0, -7 - vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - vinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 3 - vinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - vinsgr2vr.d VI0, i0, 1 //2 - .align 3 - -.L24: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t2, 1 - vadd.d VI1, VI1, VINC4 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t1, 0 - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t2, 1 - vadd.d VI2, VI1, VINC2 - vfmaxa.d x1, VX0, VX1 - vfcmp.ceq.d VT0, VX0, x1 - vbitsel.v x2, VI2, VI1, VT0 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t2, 1 - vadd.d VI1, VI2, VINC2 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t1, 0 - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t2, 1 - vadd.d VI2, VI1, VINC2 - vfmaxa.d x3, VX0, VX1 - vfcmp.ceq.d VT0, VX0, x3 - vbitsel.v x4, VI2, VI1, VT0 - vfmaxa.d x3, x1, x3 - vfcmp.ceq.d VT0, x1, x3 - vbitsel.v x2, x4, x2, VT0 - vfmaxa.d VM1, VM0, x3 - vbitsel.v VM0, VM1, VM0, VT0 - vfcmp.ceq.d VT0, VM0, VM1 - vbitsel.v VI0, x2, VI0, VT0 - addi.d I, I, -1 - blt $r0, I, .L24 - .align 3 - -.L25: - vreplvei.d VI1, VI0, 0 - vreplvei.d VI2, VI0, 1 - vreplvei.d x1, VM0, 0 - vreplvei.d x2, VM0, 1 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - vfcmp.ceq.d VT0, x2, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.d VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - b .L27 - .align 3 - -.L26: - vfmaxa.d VM0, x1, x2 - vfcmp.ceq.d VT0, x1, VM0 - vbitsel.v VI0, VI2, VI1, VT0 - .align 3 - -.L27: - movfr2gr.d i0, $f20 - .align 3 - -.L21: // N<8 - andi I, N, 7 - bge $r0, I, .L999 - srai.d i1, N, 3 - slli.d i1, i1, 3 - addi.d i1, i1, 1 //current index - movgr2fr.d $f21, i1 - movgr2fr.d $f20, i0 - .align 3 - -.L22: - fld.d $f9, X, 0 - vfmaxa.d VM1, x1, VM0 - vfcmp.ceq.d VT0, VM0, VM1 - vbitsel.v VM0, VM1, VM0, 
VT0 - vbitsel.v VI0, VI1, VI0, VT0 - addi.d I, I, -1 - addi.d i1, i1, 1 - add.d X, X, INCX - movgr2fr.d $f21, i1 - blt $r0, I, .L22 - movfr2gr.d i0, $f20 - .align 3 - -.L999: - move $r4, $r17 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/isamax_lsx.S b/kernel/loongarch64/isamax_lsx.S deleted file mode 100644 index a18aa7354..000000000 --- a/kernel/loongarch64/isamax_lsx.S +++ /dev/null @@ -1,275 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define t1 $r13 -#define t2 $r15 -#define t3 $r18 -#define t4 $r16 -#define i0 $r17 -#define i1 $r14 -#define TEMP $r19 -#define x1 $vr9 -#define x2 $vr10 -#define x3 $vr11 -#define x4 $vr12 -#define VX0 $vr13 -#define VX1 $vr14 -#define VM0 $vr15 -#define VM1 $vr16 -#define VINC4 $vr17 -#define VINC8 $vr18 -#define VI0 $vr20 -#define VI1 $vr21 -#define VI2 $vr22 -#define VI3 $vr8 -#define VI4 $vr19 -#define VT0 $vr23 - - PROLOGUE - li.d i0, 0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - vld VM0, X, 0 - addi.w i0, i0, 1 - srai.d I, N, 3 - bge $r0, I, .L21 - slli.w i0, i0, 2 //4 - vreplgr2vr.w VINC4, i0 - slli.w i0, i0, 1 //8 - vreplgr2vr.w VINC8, i0 - addi.w i0, i0, -15 - vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 1 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 2 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 3 - addi.w i0, i0, 5 - vinsgr2vr.w VI0, i0, 0 //1 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 1 //2 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 2 //3 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 3 //4 - .align 3 - -.L10: - vld VX0, X, 0 * SIZE - vadd.w VI1, VI1, VINC8 - vld VX1, X, 4 * SIZE - vadd.w VI2, VI1, VINC4 - vfmaxa.s VM1, VX0, VX1 - vfcmp.ceq.s VT0, VX0, VM1 - addi.d I, I, -1 - vbitsel.v VI2, VI2, VI1, VT0 - vfmaxa.s VM1, VM0, VM1 - vfcmp.ceq.s VT0, VM0, VM1 - addi.d X, X, 8 * SIZE - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, VI2, VI0, VT0 - blt $r0, I, .L10 - .align 3 - -.L15: - vreplvei.w VI1, VI0, 0 - vreplvei.w VI2, VI0, 1 - vreplvei.w VI3, VI0, 2 - vreplvei.w VI4, VI0, 3 - vreplvei.w x1, VM0, 0 - vreplvei.w x2, VM0, 1 - vreplvei.w x3, VM0, 2 - vreplvei.w x4, VM0, 3 - vfmaxa.s VM1, x1, x2 - vfcmp.ceq.s VT0, VM1, x1 - vbitsel.v VINC4, VI2, VI1, VT0 - vfmaxa.s VM0, x3, x4 - vfcmp.ceq.s VT0, x3, VM0 - vbitsel.v VINC8, VI4, VI3, VT0 - vfmaxa.s VM0, VM0, VM1 - vfcmp.ceq.s VT0, VM0, VM1 - vbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - vfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.s VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - b .L26 - .align 3 - -.L20: // INCX!=1 - move TEMP, X - addi.w i0, i0, 1 - ld.w t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.w VM0, t1, 0 - srai.d I, N, 3 - bge $r0, I, .L21 - ld.w t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.w t3, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.w t4, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.w VM0, t2, 1 - vinsgr2vr.w VM0, t3, 2 - vinsgr2vr.w VM0, t4, 3 - slli.w i0, i0, 2 //4 - vreplgr2vr.w VINC4, i0 - slli.w i0, i0, 1 //8 - vreplgr2vr.w VINC8, i0 - addi.w i0, i0, -15 - vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 1 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 2 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 3 - addi.w i0, i0, 5 - vinsgr2vr.w 
VI0, i0, 0 //1 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 1 //2 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 2 //3 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 3 //4 - .align 3 - -.L24: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - vadd.w VI1, VI1, VINC8 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vadd.w VI2, VI1, VINC4 - vfmaxa.s VM1, VX0, VX1 - vfcmp.ceq.s VT0, VX0, VM1 - vbitsel.v VI2, VI2, VI1, VT0 - vfmaxa.s VM1, VM0, VM1 - vfcmp.ceq.s VT0, VM0, VM1 - addi.d I, I, -1 - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, VI2, VI0, VT0 - blt $r0, I, .L24 - .align 3 - -.L25: - vreplvei.w VI1, VI0, 0 - vreplvei.w VI2, VI0, 1 - vreplvei.w VI3, VI0, 2 - vreplvei.w VI4, VI0, 3 - vreplvei.w x1, VM0, 0 - vreplvei.w x2, VM0, 1 - vreplvei.w x3, VM0, 2 - vreplvei.w x4, VM0, 3 - vfmaxa.s VM1, x1, x2 - vfcmp.ceq.s VT0, VM1, x1 - vbitsel.v VINC4, VI2, VI1, VT0 - vfmaxa.s VM0, x3, x4 - vfcmp.ceq.s VT0, x3, VM0 - vbitsel.v VINC8, VI4, VI3, VT0 - vfmaxa.s VM0, VM0, VM1 - vfcmp.ceq.s VT0, VM0, VM1 - vbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - vfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.s VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - .align 3 - -.L26: - vfcmp.ceq.s VT0, VM0, x2 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L27 - vfcmp.clt.s VT0, VI2, VI0 - vbitsel.v VI0, VI0, VI2, VT0 - .align 3 - -.L27: - vfcmp.ceq.s VT0, VM0, x3 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L28 - vfcmp.clt.s VT0, VI3, VI0 - vbitsel.v VI0, VI0, VI3, VT0 - .align 3 - -.L28: - vfcmp.ceq.s VT0, VM0, x4 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L29 - vfcmp.clt.s VT0, VI4, VI0 - vbitsel.v VI0, VI0, VI4, VT0 - .align 3 - -.L29: - movfr2gr.s i0, $f20 - .align 3 - -.L21: //N<8 - andi I, N, 7 - bge $r0, I, .L999 - srai.d i1, N, 3 - slli.d i1, i1, 3 - addi.d i1, i1, 1 //current index - movgr2fr.d $f21, i1 - movgr2fr.d $f20, i0 - .align 3 - -.L22: - fld.s $f9, X, 0 - addi.d I, I, -1 - vfmaxa.s VM1, x1, VM0 - vfcmp.ceq.s VT0, VM0, VM1 - add.d X, X, INCX - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, VI1, VI0, VT0 - addi.d i1, i1, 1 - movgr2fr.d $f21, i1 - blt $r0, I, .L22 - movfr2gr.s i0, $f20 - .align 3 - -.L999: - move $r4, $r17 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE
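-- 
Reviewer note (kept after the signature delimiter so it is not part of the applied patch): the complex iamax kernel completed above implements the BLAS i?amax contract for complex vectors, i.e. it returns the 1-based index of the first element that maximizes |Re| + |Im|. In the loops, the absolute values are produced branch-free: each lane is multiplied by -1 (vfmul with VI4) and the negated copy is selected wherever the lane compares less-than against zero (vfcmp.clt + vbitsel.v against VI3); the two halves are summed with vfadd, reduced with vfmax, and the winning lane's 1-based position is carried in a parallel index vector updated by the same compare masks, with the .L26-.L29 chains resolving ties toward the smallest index. A minimal scalar sketch of that contract, usable as a test oracle for these kernels (the name icamax_ref and its signature are assumptions of this note, not code from the patch):

#include <math.h>
#include <stddef.h>

/* 1-based index of the first complex element with the largest |Re|+|Im|.
 * x holds interleaved (re, im) pairs; incx is counted in complex elements. */
static size_t icamax_ref(size_t n, const float *x, size_t incx)
{
    if (n == 0 || incx == 0)               /* mirrors the early bge exits */
        return 0;
    size_t best = 1;
    float vmax = fabsf(x[0]) + fabsf(x[1]);
    for (size_t i = 1; i < n; i++) {
        const float *p = x + 2 * i * incx; /* complex (re, im) stride */
        float v = fabsf(p[0]) + fabsf(p[1]);
        if (v > vmax) {                    /* strict '>' keeps the first  */
            vmax = v;                      /* index on ties, matching the */
            best = i + 1;                  /* .L26-.L29 tie-break chains  */
        }
    }
    return best;
}

The strict '>' (rather than '>=') is what preserves the first-occurrence rule, so this oracle and the vector kernels agree on inputs with repeated maxima; a double-precision variant only swaps float/fabsf for double/fabs. The removed idamax_lasx.S, idamax_lsx.S and isamax_lsx.S sources above follow the same overall shape for real vectors, using vfmaxa/xvfmaxa (maximum by absolute value) in place of the |Re| + |Im| sum.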