diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
index c365e9a75..e27ce3bee 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -14,10 +14,12 @@ ZSCALKERNEL = cscal_lsx.S
 SAMAXKERNEL = amax_lsx.S
 DAMAXKERNEL = amax_lsx.S
 CAMAXKERNEL = camax_lsx.S
+ZAMAXKERNEL = camax_lsx.S

 SAMINKERNEL = amin_lsx.S
 DAMINKERNEL = amin_lsx.S
 CAMINKERNEL = camin_lsx.S
+ZAMINKERNEL = camin_lsx.S

 SMAXKERNEL = max_lsx.S
 DMAXKERNEL = max_lsx.S
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index 68360faaf..f4429cfba 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -14,10 +14,12 @@ ZSCALKERNEL = cscal_lasx.S
 SAMAXKERNEL = amax_lasx.S
 DAMAXKERNEL = amax_lasx.S
 CAMAXKERNEL = camax_lasx.S
+ZAMAXKERNEL = camax_lasx.S

 SAMINKERNEL = amin_lasx.S
 DAMINKERNEL = amin_lasx.S
 CAMINKERNEL = camin_lasx.S
+ZAMINKERNEL = camin_lasx.S

 SMAXKERNEL = max_lsx.S
 DMAXKERNEL = max_lsx.S
diff --git a/kernel/loongarch64/camax_lasx.S b/kernel/loongarch64/camax_lasx.S
index 7013430cb..f9a4e9012 100644
--- a/kernel/loongarch64/camax_lasx.S
+++ b/kernel/loongarch64/camax_lasx.S
@@ -63,42 +63,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     bge $r0, N, .L999
     bge $r0, INCX, .L999
     li.d TEMP, 1
-    li.w I, -1
     slli.d TEMP, TEMP, ZBASE_SHIFT
     slli.d INCX, INCX, ZBASE_SHIFT
-    xvreplgr2vr.w neg1, I
-    xvffint.s.w neg1, neg1
     srai.d I, N, 3
     bne INCX, TEMP, .L20
     bge $r0, I, .L23
     .align 3

 .L10:
-    xvld VX0, X, 0 * SIZE
-    xvld VX1, X, 8 * SIZE
-    addi.d I, I, -1
+    xvld VX0, X, 0
+    xvld VX1, X, 32
+#ifdef DOUBLE
+    xvpickev.d x1, VX1, VX0
+    xvpickod.d x2, VX1, VX0
+#else
     xvpickev.w x1, VX1, VX0
     xvpickod.w x2, VX1, VX0
-    xvfmul.s x3, neg1, x1
-    xvfmul.s x4, neg1, x2
-    xvfcmp.clt.s VT0, x1, res0
-    xvfcmp.clt.s VT1, x2, res0
-    xvbitsel.v x1, x1, x3, VT0
-    xvbitsel.v x2, x2, x4, VT1
+#endif
+    XVFSUB x3, res0, x1
+    XVFSUB x4, res0, x2
+    XVFMAX x1, x1, x3
+    XVFMAX x2, x2, x4
+    XVFADD VM1, x1, x2
+    XVFMAX VM0, VM0, VM1
+#ifdef DOUBLE
+    xvld VX0, X, 64
+    xvld VX1, X, 96
+    xvpickev.d x1, VX1, VX0
+    xvpickod.d x2, VX1, VX0
+    XVFSUB x3, res0, x1
+    XVFSUB x4, res0, x2
+    XVFMAX x1, x1, x3
+    XVFMAX x2, x2, x4
+    XVFADD VM1, x1, x2
+    XVFMAX VM0, VM0, VM1
+#endif
+    addi.d I, I, -1
     addi.d X, X, 16 * SIZE
-    xvfadd.s VM1, x1, x2
-    xvfmax.s VM0, VM0, VM1
     blt $r0, I, .L10
     .align 3

 .L11:
+#ifdef DOUBLE
+    xvpickve.d x1, VM0, 0
+    xvpickve.d x2, VM0, 1
+    XVFMAX VM0, x1, x2
+#else
     xvpickve.w x1, VM0, 0
     xvpickve.w x2, VM0, 1
     xvpickve.w x3, VM0, 2
     xvpickve.w x4, VM0, 3
-    xvfmax.s VM1, x1, x2
-    xvfmax.s VM0, x3, x4
-    xvfmax.s VM0, VM0, VM1
+    XVFMAX VM0, x1, x2
+    XVFMAX VM1, x3, x4
+    XVFMAX VM0, VM0, VM1
+#endif
     b .L23
     .align 3

@@ -107,66 +125,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     .align 3

 .L21:
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmax.s s1, t1, t3
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMAX s1, t1, t3
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmax.s s1, t1, t3
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMAX s1, t1, t3
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
     addi.d I, I, -1
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmax.s s3, t1, t3
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMAX s3, t1, t3
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmax.s s4, t1, t3
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMAX s4, t1, t3
     blt $r0, I, .L21
     .align 3

 .L22:
-    fmax.s s1, s1, s2
-    fmax.s s3, s3, s4
-    fmax.s s1, s1, s3
+    FMAX s1, s1, s2
+    FMAX s3, s3, s4
+    FMAX s1, s1, s3
     .align 3

 .L23: //N<8
@@ -182,12 +200,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     FABS a1, a1
     ADD a0, a0, a1
     add.d X, X, INCX
-    fmax.s s1, a0, s1
+    FMAX s1, a0, s1
     blt $r0, I, .L24
     .align 3

 .L999:
-    fmov.s $f0, $f22
+    MOV $f0, $f22
     jirl $r0, $r1, 0x0
     .align 3

diff --git a/kernel/loongarch64/camax_lsx.S b/kernel/loongarch64/camax_lsx.S
index 2e55629de..cf46cb016 100644
--- a/kernel/loongarch64/camax_lsx.S
+++ b/kernel/loongarch64/camax_lsx.S
@@ -63,54 +63,87 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     bge $r0, N, .L999
     bge $r0, INCX, .L999
     li.d TEMP, 1
-    li.w I, -1
     slli.d TEMP, TEMP, ZBASE_SHIFT
     slli.d INCX, INCX, ZBASE_SHIFT
-    vreplgr2vr.w neg1, I
-    vffint.s.w neg1, neg1
     srai.d I, N, 3
     bne INCX, TEMP, .L20
     bge $r0, I, .L23
     .align 3

 .L10:
-    vld VX0, X, 0 * SIZE
-    vld VX1, X, 4 * SIZE
-    addi.d I, I, -1
+    vld VX0, X, 0
+    vld VX1, X, 16
+#ifdef DOUBLE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+#else
     vpickev.w x1, VX1, VX0
     vpickod.w x2, VX1, VX0
-    vfmul.s x3, neg1, x1
-    vfmul.s x4, neg1, x2
-    vfcmp.clt.s VT0, x1, res0
-    vfcmp.clt.s VT1, x2, res0
-    vld VX0, X, 8 * SIZE
-    vbitsel.v x1, x1, x3, VT0
-    vbitsel.v x2, x2, x4, VT1
-    vld VX1, X, 12 * SIZE
-    vfadd.s VM1, x1, x2
+#endif
+    VFSUB x3, res0, x1
+    VFSUB x4, res0, x2
+    VFMAX x1, x1, x3
+    VFMAX x2, x2, x4
+    VFADD VM1, x1, x2
+
+    vld VX0, X, 32
+    vld VX1, X, 48
+#ifdef DOUBLE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+#else
     vpickev.w x1, VX1, VX0
     vpickod.w x2, VX1, VX0
-    vfmul.s x3, neg1, x1
-    vfmul.s x4, neg1, x2
-    vfcmp.clt.s VT0, x1, res0
-    vfcmp.clt.s VT1, x2, res0
+#endif
+    VFSUB x3, res0, x1
+    VFSUB x4, res0, x2
+    VFMAX x1, x1, x3
+    VFMAX x2, x2, x4
+    VFADD x1, x1, x2
+    VFMAX VM1, x1, VM1
+    VFMAX VM0, VM0, VM1
+#ifdef DOUBLE
+    vld VX0, X, 64
+    vld VX1, X, 80
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    VFSUB x3, res0, x1
+    VFSUB x4, res0, x2
+    VFMAX x1, x1, x3
+    VFMAX x2, x2, x4
+    VFADD VM1, x1, x2
+
+    vld VX0, X, 96
+    vld VX1, X, 112
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    VFSUB x3, res0, x1
+    VFSUB x4, res0, x2
+    VFMAX x1, x1, x3
+    VFMAX x2, x2, x4
+    VFADD x1, x1, x2
+    VFMAX VM1, x1, VM1
+    VFMAX VM0, VM0, VM1
+#endif
     addi.d X, X, 16 * SIZE
-    vbitsel.v x1, x1, x3, VT0
-    vbitsel.v x2, x2, x4, VT1
-    vfadd.s x1, x1, x2
-    vfmax.s VM1, x1, VM1
-    vfmax.s VM0, VM0, VM1
+    addi.d I, I, -1
     blt $r0, I, .L10
     .align 3

 .L11:
+#ifdef DOUBLE
+    vreplvei.d x1, VM0, 0
+    vreplvei.d x2, VM0, 1
+    VFMAX VM0, x1, x2
+#else
     vreplvei.w x1, VM0, 0
     vreplvei.w x2, VM0, 1
     vreplvei.w x3, VM0, 2
     vreplvei.w x4, VM0, 3
-    vfmax.s VM1, x1, x2
-    vfmax.s VM0, x3, x4
-    vfmax.s VM0, VM0, VM1
+    VFMAX VM1, x1, x2
+    VFMAX VM0, x3, x4
+    VFMAX VM0, VM0, VM1
+#endif
     b .L23
     .align 3

@@ -119,66 +152,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     .align 3

 .L21:
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmax.s s1, t1, t3
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMAX s1, t1, t3
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmax.s s1, t1, t3
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMAX s1, t1, t3
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
     addi.d I, I, -1
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmax.s s3, t1, t3
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMAX s3, t1, t3
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmax.s s4, t1, t3
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMAX s4, t1, t3
     blt $r0, I, .L21
     .align 3

 .L22:
-    fmax.s s1, s1, s2
-    fmax.s s3, s3, s4
-    fmax.s s1, s1, s3
+    FMAX s1, s1, s2
+    FMAX s3, s3, s4
+    FMAX s1, s1, s3
     .align 3

 .L23: //N<8
@@ -187,19 +220,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     .align 3

 .L24:
-    fld.s a0, X, 0 * SIZE
-    fld.s a1, X, 1 * SIZE
+    LD a0, X, 0 * SIZE
+    LD a1, X, 1 * SIZE
     addi.d I, I, -1
-    fabs.s a0, a0
-    fabs.s a1, a1
-    fadd.s a0, a0, a1
+    FABS a0, a0
+    FABS a1, a1
+    ADD a0, a0, a1
     add.d X, X, INCX
-    fmax.s s1, a0, s1
+    FMAX s1, a0, s1
     blt $r0, I, .L24
     .align 3

 .L999:
-    fmov.s $f0, $f22
+    MOV $f0, $f22
     jirl $r0, $r1, 0x0
     .align 3

diff --git a/kernel/loongarch64/camin_lasx.S b/kernel/loongarch64/camin_lasx.S
index d7931d30a..c1c4c98c8 100644
--- a/kernel/loongarch64/camin_lasx.S
+++ b/kernel/loongarch64/camin_lasx.S
@@ -61,49 +61,71 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     xvxor.v res0, res0, res0
     bge $r0, N, .L999
     bge $r0, INCX, .L999
-    fld.s a0, X, 0 * SIZE
-    fld.s a1, X, 1 * SIZE
-    fabs.s a0, a0
-    fabs.s a1, a1
-    fadd.s s1, a1, a0
+    LD a0, X, 0 * SIZE
+    LD a1, X, 1 * SIZE
+    FABS a0, a0
+    FABS a1, a1
+    ADD s1, a1, a0
+#ifdef DOUBLE
+    xvreplve0.d VM0, VM0
+#else
     xvreplve0.w VM0, VM0
+#endif
     li.d TEMP, 1
-    li.w I, -1
     slli.d TEMP, TEMP, ZBASE_SHIFT
     slli.d INCX, INCX, ZBASE_SHIFT
-    xvreplgr2vr.w neg1, I
-    xvffint.s.w neg1, neg1
     srai.d I, N, 3
     bne INCX, TEMP, .L20
     bge $r0, I, .L23
     .align 3

 .L10:
-    xvld VX0, X, 0 * SIZE
-    xvld VX1, X, 8 * SIZE
-    addi.d I, I, -1
+    xvld VX0, X, 0
+    xvld VX1, X, 32
+#ifdef DOUBLE
+    xvpickev.d x1, VX1, VX0
+    xvpickod.d x2, VX1, VX0
+#else
     xvpickev.w x1, VX1, VX0
     xvpickod.w x2, VX1, VX0
-    xvfmul.s x3, neg1, x1
-    xvfmul.s x4, neg1, x2
-    xvfcmp.clt.s VT0, x1, res0
-    xvfcmp.clt.s VT1, x2, res0
-    xvbitsel.v x1, x1, x3, VT0
-    xvbitsel.v x2, x2, x4, VT1
+#endif
+    XVFSUB x3, res0, x1
+    XVFSUB x4, res0, x2
+    XVFMAX x1, x1, x3
+    XVFMAX x2, x2, x4
+    XVFADD VM1, x1, x2
+    XVFMIN VM0, VM0, VM1
+#ifdef DOUBLE
+    xvld VX0, X, 64
+    xvld VX1, X, 96
+    xvpickev.d x1, VX1, VX0
+    xvpickod.d x2, VX1, VX0
+    XVFSUB x3, res0, x1
+    XVFSUB x4, res0, x2
+    XVFMAX x1, x1, x3
+    XVFMAX x2, x2, x4
+    XVFADD VM1, x1, x2
+    XVFMIN VM0, VM0, VM1
+#endif
+    addi.d I, I, -1
     addi.d X, X, 16 * SIZE
-    xvfadd.s VM1, x1, x2
-    xvfmin.s VM0, VM0, VM1
     blt $r0, I, .L10
     .align 3

 .L11:
+#ifdef DOUBLE
+    xvpickve.d x1, VM0, 0
+    xvpickve.d x2, VM0, 1
+    XVFMIN VM0, x1, x2
+#else
     xvpickve.w x1, VM0, 0
     xvpickve.w x2, VM0, 1
     xvpickve.w x3, VM0, 2
     xvpickve.w x4, VM0, 3
-    xvfmin.s VM1, x1, x2
-    xvfmin.s VM0, x3, x4
-    xvfmin.s VM0, VM0, VM1
+    XVFMIN VM0, x1, x2
+    XVFMIN VM1, x3, x4
+    XVFMIN VM0, VM0, VM1
+#endif
     b .L23
     .align 3

@@ -112,66 +134,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     .align 3

 .L21:
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmin.s s1, t1, t3
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMIN s1, t1, t3
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmin.s s1, t1, t3
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMIN s1, t1, t3
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
     addi.d I, I, -1
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmin.s s3, t1, t3
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMIN s3, t1, t3
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmin.s s4, t1, t3
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMIN s4, t1, t3
     blt $r0, I, .L21
     .align 3

 .L22:
-    fmin.s s1, s1, s2
-    fmin.s s3, s3, s4
-    fmin.s s1, s1, s3
+    FMIN s1, s1, s2
+    FMIN s3, s3, s4
+    FMIN s1, s1, s3
     .align 3

 .L23: //N<8
@@ -187,12 +209,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     FABS a1, a1
     ADD a0, a0, a1
     add.d X, X, INCX
-    fmin.s s1, a0, s1
+    FMIN s1, a0, s1
     blt $r0, I, .L24
     .align 3

 .L999:
-    fmov.s $f0, $f22
+    MOV $f0, $f22
     jirl $r0, $r1, 0x0
     .align 3

diff --git a/kernel/loongarch64/camin_lsx.S b/kernel/loongarch64/camin_lsx.S
index e9ad6b04d..ff666ea8f 100644
--- a/kernel/loongarch64/camin_lsx.S
+++ b/kernel/loongarch64/camin_lsx.S
@@ -61,61 +61,98 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vxor.v res0, res0, res0
     bge $r0, N, .L999
     bge $r0, INCX, .L999
-    fld.s a0, X, 0 * SIZE
-    fld.s a1, X, 1 * SIZE
-    fabs.s a0, a0
-    fabs.s a1, a1
-    fadd.s s1, a1, a0
+    LD a0, X, 0 * SIZE
+    LD a1, X, 1 * SIZE
+    FABS a0, a0
+    FABS a1, a1
+    ADD s1, a1, a0
+#ifdef DOUBLE
+    vreplvei.d VM0, VM0, 0
+#else
     vreplvei.w VM0, VM0, 0
+#endif
     li.d TEMP, 1
-    li.w I, -1
     slli.d TEMP, TEMP, ZBASE_SHIFT
     slli.d INCX, INCX, ZBASE_SHIFT
-    vreplgr2vr.w neg1, I
-    vffint.s.w neg1, neg1
     srai.d I, N, 3
     bne INCX, TEMP, .L20
     bge $r0, I, .L23
     .align 3

 .L10:
-    vld VX0, X, 0 * SIZE
-    vld VX1, X, 4 * SIZE
+    vld VX0, X, 0
+    vld VX1, X, 16
+#ifdef DOUBLE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+#else
+    vpickev.w x1, VX1, VX0
+    vpickod.w x2, VX1, VX0
+#endif
+    VFSUB x3, res0, x1
+    VFSUB x4, res0, x2
+    VFMAX x1, x1, x3
+    VFMAX x2, x2, x4
+    VFADD VM1, x1, x2
+
+    vld VX0, X, 32
+    vld VX1, X, 48
+#ifdef DOUBLE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+#else
+    vpickev.w x1, VX1, VX0
+    vpickod.w x2, VX1, VX0
+#endif
+    VFSUB x3, res0, x1
+    VFSUB x4, res0, x2
+    VFMAX x1, x1, x3
+    VFMAX x2, x2, x4
+    VFADD x1, x1, x2
+    VFMIN VM1, x1, VM1
+    VFMIN VM0, VM0, VM1
+#ifdef DOUBLE
+    vld VX0, X, 64
+    vld VX1, X, 80
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    VFSUB x3, res0, x1
+    VFSUB x4, res0, x2
+    VFMAX x1, x1, x3
+    VFMAX x2, x2, x4
+    VFADD VM1, x1, x2
+
+    vld VX0, X, 96
+    vld VX1, X, 112
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    VFSUB x3, res0, x1
+    VFSUB x4, res0, x2
+    VFMAX x1, x1, x3
+    VFMAX x2, x2, x4
+    VFADD x1, x1, x2
+    VFMIN VM1, x1, VM1
+    VFMIN VM0, VM0, VM1
+#endif
     addi.d I, I, -1
-    vpickev.w x1, VX1, VX0
-    vpickod.w x2, VX1, VX0
-    vfmul.s x3, neg1, x1
-    vfmul.s x4, neg1, x2
-    vfcmp.clt.s VT0, x1, res0
-    vfcmp.clt.s VT1, x2, res0
-    vld VX0, X, 8 * SIZE
-    vbitsel.v x1, x1, x3, VT0
-    vbitsel.v x2, x2, x4, VT1
-    vld VX1, X, 12 * SIZE
-    vfadd.s VM1, x1, x2
-    vpickev.w x1, VX1, VX0
-    vpickod.w x2, VX1, VX0
-    vfmul.s x3, neg1, x1
-    vfmul.s x4, neg1, x2
-    vfcmp.clt.s VT0, x1, res0
-    vfcmp.clt.s VT1, x2, res0
     addi.d X, X, 16 * SIZE
-    vbitsel.v x1, x1, x3, VT0
-    vbitsel.v x2, x2, x4, VT1
-    vfadd.s x1, x1, x2
-    vfmin.s VM1, x1, VM1
-    vfmin.s VM0, VM0, VM1
     blt $r0, I, .L10
     .align 3

 .L11:
+#ifdef DOUBLE
+    vreplvei.d x1, VM0, 0
+    vreplvei.d x2, VM0, 1
+    VFMIN VM0, x1, x2
+#else
     vreplvei.w x1, VM0, 0
     vreplvei.w x2, VM0, 1
     vreplvei.w x3, VM0, 2
     vreplvei.w x4, VM0, 3
-    vfmin.s VM1, x1, x2
-    vfmin.s VM0, x3, x4
-    vfmin.s VM0, VM0, VM1
+    VFMIN VM1, x1, x2
+    VFMIN VM0, x3, x4
+    VFMIN VM0, VM0, VM1
+#endif
     b .L23
     .align 3

@@ -124,66 +161,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     .align 3

 .L21:
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmin.s s1, t1, t3
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMIN s1, t1, t3
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmin.s s1, t1, t3
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMIN s1, t1, t3
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
     addi.d I, I, -1
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmin.s s3, t1, t3
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMIN s3, t1, t3
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmin.s s4, t1, t3
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMIN s4, t1, t3
     blt $r0, I, .L21
     .align 3

 .L22:
-    fmin.s s1, s1, s2
-    fmin.s s3, s3, s4
-    fmin.s s1, s1, s3
+    FMIN s1, s1, s2
+    FMIN s3, s3, s4
+    FMIN s1, s1, s3
     .align 3

 .L23: //N<8
@@ -192,19 +229,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     .align 3

 .L24:
-    fld.s a0, X, 0 * SIZE
-    fld.s a1, X, 1 * SIZE
+    LD a0, X, 0 * SIZE
+    LD a1, X, 1 * SIZE
     addi.d I, I, -1
-    fabs.s a0, a0
-    fabs.s a1, a1
-    fadd.s a0, a0, a1
+    FABS a0, a0
+    FABS a1, a1
+    ADD a0, a0, a1
     add.d X, X, INCX
-    fmin.s s1, a0, s1
+    FMIN s1, a0, s1
     blt $r0, I, .L24
     .align 3

 .L999:
-    fmov.s $f0, $f22
+    MOV $f0, $f22
     jirl $r0, $r1, 0x0
     .align 3
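Note on what these kernels compute: the complex AMAX/AMIN reductions use the BLAS |Re(x_i)| + |Im(x_i)| measure rather than the true modulus, and the new vector loops form it with a subtract-from-zero plus max pair (XVFSUB/XVFMAX, i.e. |x| = max(x, 0 - x)) followed by XVFADD, replacing the old multiply-by-minus-one and bit-select sequence. A minimal scalar C sketch of the ZAMAX case follows; it is illustrative only (zamax_ref is a hypothetical helper written for this note, not the OpenBLAS generic kernel), but it mirrors what the .L21/.L24 scalar tails above compute.

#include <math.h>
#include <stdio.h>

/* Illustrative reference: max over i of |Re(x[i])| + |Im(x[i])| for an
 * interleaved (re, im) array walked with a stride of inc_x complex
 * elements.  Hypothetical helper for explanation, not OpenBLAS code. */
static double zamax_ref(long n, const double *x, long inc_x)
{
    if (n <= 0 || inc_x <= 0) return 0.0;

    double m = fabs(x[0]) + fabs(x[1]);          /* first element seeds the max */
    for (long i = 1; i < n; i++) {
        const double *p = x + 2 * i * inc_x;     /* 2 doubles per complex entry */
        double v = fabs(p[0]) + fabs(p[1]);
        if (v > m) m = v;
    }
    return m;
}

int main(void)
{
    const double x[] = { 1.0, -2.0, -3.5, 0.25, 0.0, 4.0 };  /* 3 complex values */
    printf("%f\n", zamax_ref(3, x, 1));          /* prints 4.000000 = |0.0| + |4.0| */
    return 0;
}

The ZAMIN variant is the same loop with a running minimum, which is why the patch can point ZAMAXKERNEL/ZAMINKERNEL at the existing camax_*.S and camin_*.S sources and only add the DOUBLE code paths.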