From 993ede7c70658870921b446b145310f1b6c1edf6 Mon Sep 17 00:00:00 2001 From: yancheng Date: Mon, 27 Nov 2023 11:30:34 +0800 Subject: [PATCH 01/15] loongarch64: Add optimizations for scal. --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 3 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 3 + kernel/loongarch64/dscal_lasx.S | 194 +++++++++++++++++++++ kernel/loongarch64/dscal_lsx.S | 205 +++++++++++++++++++++++ kernel/loongarch64/sscal_lasx.S | 188 +++++++++++++++++++++ kernel/loongarch64/sscal_lsx.S | 194 +++++++++++++++++++++ 6 files changed, 787 insertions(+) create mode 100644 kernel/loongarch64/dscal_lasx.S create mode 100644 kernel/loongarch64/dscal_lsx.S create mode 100644 kernel/loongarch64/sscal_lasx.S create mode 100644 kernel/loongarch64/sscal_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index b2a396674..e553c4b95 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -4,4 +4,7 @@ SDOTKERNEL = dot_lsx.S DSDOTKERNEL = dot_lsx.S DDOTKERNEL = dot_lsx.S +SSCALKERNEL = sscal_lsx.S +DSCALKERNEL = dscal_lsx.S + endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 020a82303..4c0c1c2c8 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -4,6 +4,9 @@ SDOTKERNEL = dot_lasx.S DSDOTKERNEL = dot_lasx.S DDOTKERNEL = dot_lasx.S +SSCALKERNEL = sscal_lasx.S +DSCALKERNEL = dscal_lasx.S + DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S DGEMMITCOPY = dgemm_tcopy_16.S diff --git a/kernel/loongarch64/dscal_lasx.S b/kernel/loongarch64/dscal_lasx.S new file mode 100644 index 000000000..153662378 --- /dev/null +++ b/kernel/loongarch64/dscal_lasx.S @@ -0,0 +1,194 @@ +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define ALPHA $f0 +#define X $r7 +#define INCX $r8 +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r18 +#define t3 $r15 +#define t4 $r17 +#define XX $r16 +#define VX0 $xr12 +#define VX1 $xr13 +#define VT0 $xr14 +#define VT1 $xr15 +#define VALPHA $xr19 +#define a1 $f8 +#define a2 $f23 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.d.l a1, a1 + movgr2fr.d a2, TEMP + ffint.d.l a2, a2 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + fcmp.ceq.d $fcc0, ALPHA, a1 + bcnez $fcc0, .L20 //ALPHA==0 + fcmp.ceq.d $fcc0, ALPHA, a2 + bcnez $fcc0, .L999 //ALPHA==1 return + srai.d I, N, 3 + beq INCX, TEMP, .L30 //ALPHA!=0|1 and INCX==1 + movfr2gr.d TEMP, ALPHA + xvreplgr2vr.d VALPHA, TEMP + move XX, X + .align 3 + +.L10: + bge $r0, I, .L32 + .align 3 +.L11: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + xvfmul.d VT0, VX0, VALPHA + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvfmul.d VT1, VX1, VALPHA + xvstelm.d VT1, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT1, XX, 0, 1 + add.d 
XX, XX, INCX + xvstelm.d VT1, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT1, XX, 0, 3 + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L11 + b .L32 + .align 3 + +.L20: + srai.d I, N, 3 + beq INCX, TEMP, .L24 + bge $r0, I, .L22 + .align 3 + +.L21: + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L23: + fst.d a1, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L23 + jirl $r0, $r1, 0 + .align 3 + +.L24: + bge $r0, I, .L26 /*N<8 INCX==1*/ + .align 3 +.L25: + xvxor.v VX0, VX0, VX0 + xvst VX0, X, 0 * SIZE + xvst VX0, X, 4 * SIZE + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L25 + .align 3 + +.L26: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L27: + fst.d a1, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L27 + jirl $r0, $r1, 0 + .align 3 + +.L30: + bge $r0, I, .L32/*N<8 INCX==1*/ + movfr2gr.d TEMP, ALPHA + xvreplgr2vr.d VALPHA , TEMP + .align 3 + +.L31: + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfmul.d VT0, VX0, VALPHA + xvfmul.d VT1, VX1, VALPHA + addi.d I, I, -1 + xvst VT0, X, 0 * SIZE + xvst VT1, X, 4 * SIZE + addi.d X, X, 8 * SIZE + blt $r0, I, .L31 + .align 3 + +.L32: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L33: + fld.d a1, X, 0 * SIZE + addi.d I, I, -1 + fmul.d a1, ALPHA, a1 + fst.d a1, X, 0 * SIZE + add.d X, X, INCX + blt $r0, I, .L33 + jirl $r0, $r1, 0 + .align 3 + +.L999: + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/dscal_lsx.S b/kernel/loongarch64/dscal_lsx.S new file mode 100644 index 000000000..55f497752 --- /dev/null +++ b/kernel/loongarch64/dscal_lsx.S @@ -0,0 +1,205 @@ +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define ALPHA $f0 +#define X $r7 +#define INCX $r8 +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r18 +#define t3 $r15 +#define t4 $r17 +#define XX $r16 +#define VX0 $vr12 +#define VX1 $vr13 +#define VT0 $vr14 +#define VT1 $vr15 +#define VALPHA $vr19 +#define a1 $f8 +#define a2 $f23 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.d.l a1, a1 + movgr2fr.d a2, TEMP + ffint.d.l a2, a2 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + fcmp.ceq.d $fcc0, ALPHA, a1 + bcnez $fcc0, .L20 //ALPHA==0 + fcmp.ceq.d $fcc0, ALPHA, a2 + bcnez $fcc0, .L999 //ALPHA==1 return + srai.d I, N, 3 + beq INCX, TEMP, .L30 //ALPHA!=0|1 and INCX==1 + movfr2gr.d TEMP, ALPHA + vreplgr2vr.d VALPHA, TEMP + move XX, X + .align 3 + +.L10: //ALPHA!=0|1 and INCX!=1 + bge $r0, I, .L32 + .align 3 + +.L11: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vfmul.d VT0, VX0, VALPHA + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vfmul.d VT1, VX1, VALPHA + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vstelm.d VT1, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT1, XX, 0, 1 + add.d XX, XX, INCX + vfmul.d VT0, VX0, VALPHA + ld.d t3, X, 0 * SIZE + 
add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vfmul.d VT1, VX1, VALPHA + vstelm.d VT1, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT1, XX, 0, 1 + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L11 + b .L32 + .align 3 + +.L20: + srai.d I, N, 3 + beq INCX, TEMP, .L24 + bge $r0, I, .L22 + .align 3 + +.L21: + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L23: + fst.d a1, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L23 + jirl $r0, $r1, 0 + .align 3 + +.L24: + bge $r0, I, .L26 /*N<8 INCX==1*/ + .align 3 +.L25: + vxor.v VX0, VX0, VX0 + vst VX0, X, 0 * SIZE + vst VX0, X, 2 * SIZE + vst VX0, X, 4 * SIZE + vst VX0, X, 6 * SIZE + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L25 + .align 3 + +.L26: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L27: + fst.d a1, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L27 + jirl $r0, $r1, 0 + .align 3 + +.L30: + bge $r0, I, .L32/*N<8 INCX==1*/ + movfr2gr.d TEMP, ALPHA + vreplgr2vr.d VALPHA , TEMP + .align 3 + +.L31: + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfmul.d VT0, VX0, VALPHA + vfmul.d VT1, VX1, VALPHA + vld VX0, X, 4 * SIZE + vst VT0, X, 0 * SIZE + vst VT1, X, 2 * SIZE + vfmul.d VT0, VX0, VALPHA + vld VX1, X, 6 * SIZE + vst VT0, X, 4 * SIZE + vfmul.d VT1, VX1, VALPHA + vst VT1, X, 6 * SIZE + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L31 + .align 3 + +.L32: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L33: + fld.d a1, X, 0 * SIZE + addi.d I, I, -1 + fmul.d a1, ALPHA, a1 + fst.d a1, X, 0 * SIZE + add.d X, X, INCX + blt $r0, I, .L33 + jirl $r0, $r1, 0 + .align 3 + +.L999: + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/sscal_lasx.S b/kernel/loongarch64/sscal_lasx.S new file mode 100644 index 000000000..329f24659 --- /dev/null +++ b/kernel/loongarch64/sscal_lasx.S @@ -0,0 +1,188 @@ +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define ALPHA $f0 +#define X $r7 +#define INCX $r8 +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r18 +#define t3 $r15 +#define t4 $r17 +#define XX $r16 +#define VX0 $xr12 +#define VX1 $xr13 +#define VT0 $xr14 +#define VT1 $xr15 +#define VALPHA $xr19 +#define a1 $f8 +#define a2 $f23 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.s.l a1, a1 + movgr2fr.d a2, TEMP + ffint.s.l a2, a2 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + fcmp.ceq.s $fcc0, ALPHA, a1 + bcnez $fcc0, .L20 //ALPHA==0 + fcmp.ceq.s $fcc0, ALPHA, a2 + bcnez $fcc0, .L999 //ALPHA==1 return + srai.d I, N, 3 + beq INCX, TEMP, .L30 //ALPHA!=0|1 and INCX==1 + movfr2gr.s TEMP, ALPHA + xvreplgr2vr.w VALPHA, TEMP + move XX, X + +.L10: //ALPHA!=0|1 and INCX!=1 + bge $r0, I, .L32 + .align 3 +.L11: + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, 
X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvfmul.s VT0, VX0, VALPHA + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L11 + b .L32 + .align 3 + +.L20: + srai.d I, N, 3 + beq INCX, TEMP, .L24 + bge $r0, I, .L22 + .align 3 + +.L21: + fst.s a1, X, 0 + add.d X, X, INCX + fst.s a1, X, 0 + add.d X, X, INCX + fst.s a1, X, 0 + add.d X, X, INCX + fst.s a1, X, 0 + add.d X, X, INCX + fst.s a1, X, 0 + add.d X, X, INCX + fst.s a1, X, 0 + add.d X, X, INCX + fst.s a1, X, 0 + add.d X, X, INCX + fst.s a1, X, 0 + add.d X, X, INCX + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L23: + fst.s a1, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L23 + jirl $r0, $r1, 0 + .align 3 + +.L24: + bge $r0, I, .L26 /*N<8 INCX==1*/ + .align 3 +.L25: + xvxor.v VX0, VX0, VX0 + xvst VX0, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L25 + .align 3 + +.L26: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L27: + fst.s a1, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L27 + jirl $r0, $r1, 0 + .align 3 + +.L30: + bge $r0, I, .L32/*N<8 INCX==1*/ + movfr2gr.s TEMP, ALPHA + xvreplgr2vr.w VALPHA , TEMP + .align 3 + +.L31: + xvld VX0, X, 0 * SIZE + addi.d I, I, -1 + xvfmul.s VT0, VX0, VALPHA + xvst VT0, X, 0 * SIZE + addi.d X, X, 8 * SIZE + blt $r0, I, .L31 + .align 3 + +.L32: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L33: + fld.s a1, X, 0 * SIZE + addi.d I, I, -1 + fmul.s a1, ALPHA, a1 + fst.s a1, X, 0 * SIZE + add.d X, X, INCX + blt $r0, I, .L33 + jirl $r0, $r1, 0 + .align 3 + +.L999: + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/sscal_lsx.S b/kernel/loongarch64/sscal_lsx.S new file mode 100644 index 000000000..d0ea1307d --- /dev/null +++ b/kernel/loongarch64/sscal_lsx.S @@ -0,0 +1,194 @@ +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define ALPHA $f0 +#define X $r7 +#define INCX $r8 +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r18 +#define t3 $r15 +#define t4 $r17 +#define XX $r16 +#define VX0 $vr12 +#define VX1 $vr13 +#define VT0 $vr14 +#define VT1 $vr15 +#define VALPHA $vr19 +#define a1 $f8 +#define a2 $f23 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.s.l a1, a1 + movgr2fr.d a2, TEMP + ffint.s.l a2, a2 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + fcmp.ceq.s $fcc0, ALPHA, a1 + bcnez $fcc0, .L20 //ALPHA==0 + fcmp.ceq.s $fcc0, ALPHA, a2 + bcnez $fcc0, .L999 //ALPHA==1 return + srai.d I, N, 3 + beq INCX, TEMP, .L30 //ALPHA!=0|1 and INCX==1 + movfr2gr.s TEMP, ALPHA + vreplgr2vr.w VALPHA, TEMP + move XX, X + .align 3 + +.L10: //ALPHA!=0|1 and INCX!=1 + bge $r0, I, .L32 + .align 3 +.L11: + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w 
VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + vfmul.s VT0, VX0, VALPHA + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vfmul.s VT1, VX1, VALPHA + vstelm.w VT1, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT1, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT1, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT1, XX, 0, 3 + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L11 + b .L32 + .align 3 + +.L20: + srai.d I, N, 3 + beq INCX, TEMP, .L24 + bge $r0, I, .L22 + .align 3 + +.L21: + fst.s a1, X, 0 + add.d X, X, INCX + fst.s a1, X, 0 + add.d X, X, INCX + fst.s a1, X, 0 + add.d X, X, INCX + fst.s a1, X, 0 + add.d X, X, INCX + fst.s a1, X, 0 + add.d X, X, INCX + fst.s a1, X, 0 + add.d X, X, INCX + fst.s a1, X, 0 + add.d X, X, INCX + fst.s a1, X, 0 + add.d X, X, INCX + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L23: + fst.s a1, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L23 + jirl $r0, $r1, 0 + .align 3 + +.L24: + bge $r0, I, .L26 /*N<8 INCX==1*/ + .align 3 +.L25: + vxor.v VX0, VX0, VX0 + vst VX0, X, 0 * SIZE + vst VX0, X, 4 * SIZE + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L25 + .align 3 + +.L26: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L27: + fst.s a1, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L27 + jirl $r0, $r1, 0 + .align 3 + +.L30: + bge $r0, I, .L32/*N<8 INCX==1*/ + movfr2gr.s TEMP, ALPHA + vreplgr2vr.w VALPHA , TEMP + .align 3 + +.L31: + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vfmul.s VT0, VX0, VALPHA + vfmul.s VT1, VX1, VALPHA + addi.d I, I, -1 + vst VT0, X, 0 * SIZE + vst VT1, X, 4 * SIZE + addi.d X, X, 8 * SIZE + blt $r0, I, .L31 + .align 3 + +.L32: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L33: + fld.s a1, X, 0 * SIZE + addi.d I, I, -1 + fmul.s a1, ALPHA, a1 + fst.s a1, X, 0 * SIZE + add.d X, X, INCX + blt $r0, I, .L33 + jirl $r0, $r1, 0 + .align 3 + +.L999: + jirl $r0, $r1, 0x0 + + EPILOGUE From 265b5f2e803d2d66b7fe75ec356dc14fcb5ef726 Mon Sep 17 00:00:00 2001 From: yancheng Date: Thu, 7 Dec 2023 10:57:13 +0800 Subject: [PATCH 02/15] loongarch64: Add optimizations for amax. 
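
A brief scalar reference may help when reviewing the vector code: AMAX reduces the input to its largest absolute value, which the hot loops express with xvfmaxa.d (element-wise maximum of absolute values) over 8-element blocks. The sketch below is illustrative only and not part of the patch; it uses plain double/long in place of the FLOAT/BLASLONG typedefs from common.h, and the helper name amax_ref is invented for this note.

    #include <math.h>

    /* Assumed semantics for (d)amax: the largest |x[i]| over n elements
     * read with stride incx.  The kernels take an early exit for n <= 0 or
     * incx <= 0 (see the PROLOGUE); the sketch returns 0.0 in that case. */
    static double amax_ref(long n, const double *x, long incx)
    {
        if (n <= 0 || incx <= 0)
            return 0.0;
        double m = fabs(x[0]);
        for (long i = 1; i < n; i++) {
            double v = fabs(x[i * incx]);
            if (v > m)
                m = v;
        }
        return m;
    }
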
--- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 3 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 3 + kernel/loongarch64/damax_lasx.S | 183 ++++++++++++++++++++ kernel/loongarch64/damax_lsx.S | 145 ++++++++++++++++ kernel/loongarch64/samax_lasx.S | 208 +++++++++++++++++++++++ kernel/loongarch64/samax_lsx.S | 177 +++++++++++++++++++ 6 files changed, 719 insertions(+) create mode 100644 kernel/loongarch64/damax_lasx.S create mode 100644 kernel/loongarch64/damax_lsx.S create mode 100644 kernel/loongarch64/samax_lasx.S create mode 100644 kernel/loongarch64/samax_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index e553c4b95..a4d5f8f87 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -7,4 +7,7 @@ DDOTKERNEL = dot_lsx.S SSCALKERNEL = sscal_lsx.S DSCALKERNEL = dscal_lsx.S +SAMAXKERNEL = samax_lsx.S +DAMAXKERNEL = damax_lsx.S + endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 4c0c1c2c8..8c7481ae6 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -7,6 +7,9 @@ DDOTKERNEL = dot_lasx.S SSCALKERNEL = sscal_lasx.S DSCALKERNEL = dscal_lasx.S +SAMAXKERNEL = samax_lasx.S +DAMAXKERNEL = damax_lasx.S + DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S DGEMMITCOPY = dgemm_tcopy_16.S diff --git a/kernel/loongarch64/damax_lasx.S b/kernel/loongarch64/damax_lasx.S new file mode 100644 index 000000000..c44ce4995 --- /dev/null +++ b/kernel/loongarch64/damax_lasx.S @@ -0,0 +1,183 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define J $r13 +#define t1 $r14 +#define t2 $r18 +#define t3 $r15 +#define t4 $r17 +#define TEMP $r16 +#define m0 $xr8 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define x5 $xr13 +#define x6 $xr14 +#define x7 $xr15 +#define x8 $xr16 +#define VX0 $xr20 +#define VX1 $xr21 +#define VM0 $xr22 +#define VM1 $xr23 +#define VM2 $xr18 +#define VM3 $xr19 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + xvld VM0, X, 0 + srai.d I, N, 3 + bge $r0, I, .L12 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + addi.d I, I, -1 + xvfmaxa.d VM1, VX1, VX0 + addi.d X, X, 8 * SIZE + xvfmaxa.d VM0, VM0, VM1 + blt $r0, I, .L10 + .align 3 + +.L11: + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmaxa.d VM1, x1, x2 + xvfmaxa.d VM2, x3, x4 + xvfmaxa.d VM0, VM1, VM2 + .align 3 + +.L12: //INCX==1 and N<8 + andi I, N, 7 + li.d J, 4 + bge J, I, .L13 // 4 Date: Thu, 7 Dec 2023 11:08:09 +0800 Subject: [PATCH 03/15] loongarch64: Add optimization for amin. 
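
Same reviewing aid as for the amax patch: AMIN keeps the smallest absolute value, and the unrolled loops use xvfmina.d instead of xvfmaxa.d. The helper below is a hedged plain-C sketch; its name and the double/long types are illustrative rather than taken from the patch.

    #include <math.h>

    /* Assumed semantics for (d)amin: the smallest |x[i]| over n elements
     * with stride incx; the sketch returns 0.0 where the kernels take
     * their early exit (n <= 0 or incx <= 0). */
    static double amin_ref(long n, const double *x, long incx)
    {
        if (n <= 0 || incx <= 0)
            return 0.0;
        double m = fabs(x[0]);
        for (long i = 1; i < n; i++) {
            double v = fabs(x[i * incx]);
            if (v < m)
                m = v;
        }
        return m;
    }
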
--- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 3 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 3 + kernel/loongarch64/damin_lasx.S | 178 +++++++++++++++++++ kernel/loongarch64/damin_lsx.S | 145 ++++++++++++++++ kernel/loongarch64/samin_lasx.S | 208 +++++++++++++++++++++++ kernel/loongarch64/samin_lsx.S | 177 +++++++++++++++++++ 6 files changed, 714 insertions(+) create mode 100644 kernel/loongarch64/damin_lasx.S create mode 100644 kernel/loongarch64/damin_lsx.S create mode 100644 kernel/loongarch64/samin_lasx.S create mode 100644 kernel/loongarch64/samin_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index a4d5f8f87..279ff6a9c 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -10,4 +10,7 @@ DSCALKERNEL = dscal_lsx.S SAMAXKERNEL = samax_lsx.S DAMAXKERNEL = damax_lsx.S +SAMINKERNEL = samin_lsx.S +DAMINKERNEL = damin_lsx.S + endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 8c7481ae6..83db79050 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -10,6 +10,9 @@ DSCALKERNEL = dscal_lasx.S SAMAXKERNEL = samax_lasx.S DAMAXKERNEL = damax_lasx.S +SAMINKERNEL = samin_lasx.S +DAMINKERNEL = damin_lasx.S + DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S DGEMMITCOPY = dgemm_tcopy_16.S diff --git a/kernel/loongarch64/damin_lasx.S b/kernel/loongarch64/damin_lasx.S new file mode 100644 index 000000000..9d96dd997 --- /dev/null +++ b/kernel/loongarch64/damin_lasx.S @@ -0,0 +1,178 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define J $r13 +#define t1 $r14 +#define t2 $r18 +#define t3 $r15 +#define t4 $r17 +#define TEMP $r16 +#define m0 $xr8 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VX0 $xr20 +#define VX1 $xr21 +#define VM0 $xr22 +#define VM1 $xr23 +#define VM2 $xr19 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + xvld VM0, X, 0 + srai.d I, N, 3 + bge $r0, I, .L12 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + addi.d I, I, -1 + xvld VX1, X, 4 * SIZE + xvfmina.d VM1, VX1, VX0 + addi.d X, X, 8 * SIZE + xvfmina.d VM0, VM0, VM1 + blt $r0, I, .L10 + .align 3 + +.L11: + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmina.d VM1, x1, x2 + xvfmina.d VM2, x3, x4 + xvfmina.d VM0, VM1, VM2 + .align 3 + +.L12: //INCX==1 and N<8 + andi I, N, 7 + li.d J, 4 + bge J, I, .L13 // 4 Date: Thu, 7 Dec 2023 11:30:02 +0800 Subject: [PATCH 04/15] loongarch64: Add optimization for max. 
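
MAX drops the absolute value and keeps the largest signed element, so the hot loops switch to xvfmax.d. A minimal sketch of the assumed semantics follows (illustrative name and types; the NaN behaviour of xvfmax.d is not modelled by the plain '>' comparison).

    /* Assumed semantics for (d)max: the largest x[i] over n elements with
     * stride incx; the sketch returns 0.0 on the kernels' early-exit
     * conditions (n <= 0 or incx <= 0). */
    static double max_ref(long n, const double *x, long incx)
    {
        if (n <= 0 || incx <= 0)
            return 0.0;
        double m = x[0];
        for (long i = 1; i < n; i++)
            if (x[i * incx] > m)
                m = x[i * incx];
        return m;
    }
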
--- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 3 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 3 + kernel/loongarch64/dmax_lasx.S | 175 +++++++++++++++++++ kernel/loongarch64/dmax_lsx.S | 141 ++++++++++++++++ kernel/loongarch64/smax_lasx.S | 205 +++++++++++++++++++++++ kernel/loongarch64/smax_lsx.S | 171 +++++++++++++++++++ 6 files changed, 698 insertions(+) create mode 100644 kernel/loongarch64/dmax_lasx.S create mode 100644 kernel/loongarch64/dmax_lsx.S create mode 100644 kernel/loongarch64/smax_lasx.S create mode 100644 kernel/loongarch64/smax_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 279ff6a9c..e00893b72 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -13,4 +13,7 @@ DAMAXKERNEL = damax_lsx.S SAMINKERNEL = samin_lsx.S DAMINKERNEL = damin_lsx.S +SMAXKERNEL = smax_lsx.S +DMAXKERNEL = dmax_lsx.S + endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 83db79050..f238436f5 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -13,6 +13,9 @@ DAMAXKERNEL = damax_lasx.S SAMINKERNEL = samin_lasx.S DAMINKERNEL = damin_lasx.S +SMAXKERNEL = smax_lasx.S +DMAXKERNEL = dmax_lasx.S + DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S DGEMMITCOPY = dgemm_tcopy_16.S diff --git a/kernel/loongarch64/dmax_lasx.S b/kernel/loongarch64/dmax_lasx.S new file mode 100644 index 000000000..46366d2ec --- /dev/null +++ b/kernel/loongarch64/dmax_lasx.S @@ -0,0 +1,175 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define J $r13 +#define t1 $r14 +#define t2 $r18 +#define t3 $r15 +#define t4 $r17 +#define TEMP $r16 +#define m0 $xr8 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VX0 $xr20 +#define VX1 $xr21 +#define VM0 $xr22 +#define VM1 $xr23 +#define VM2 $xr19 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + xvld VM0, X, 0 + srai.d I, N, 3 + bge $r0, I, .L12 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + addi.d I, I, -1 + xvfmax.d VM1, VX1, VX0 + addi.d X, X, 8 * SIZE + xvfmax.d VM0, VM0, VM1 + blt $r0, I, .L10 + .align 3 + +.L11: + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmax.d VM1, x1, x2 + xvfmax.d VM2, x3, x4 + xvfmax.d VM0, VM1, VM2 + .align 3 + +.L12: //INCX==1 and N<8 + andi I, N, 7 + li.d J, 4 + bge J, I, .L13 // 4 Date: Thu, 7 Dec 2023 11:51:19 +0800 Subject: [PATCH 05/15] loongarch64: Add optimization for min. 
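
Same shape as the max sketch, with xvfmin.d in the vector loops and '<' in the scalar reference below (again a hedged sketch, not patch code).

    /* Assumed semantics for (d)min: the smallest x[i] over n elements with
     * stride incx; the sketch returns 0.0 where the kernels exit early
     * (n <= 0 or incx <= 0). */
    static double min_ref(long n, const double *x, long incx)
    {
        if (n <= 0 || incx <= 0)
            return 0.0;
        double m = x[0];
        for (long i = 1; i < n; i++)
            if (x[i * incx] < m)
                m = x[i * incx];
        return m;
    }
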
--- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 3 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 3 + kernel/loongarch64/dmin_lasx.S | 175 +++++++++++++++++++ kernel/loongarch64/dmin_lsx.S | 143 ++++++++++++++++ kernel/loongarch64/smin_lasx.S | 205 +++++++++++++++++++++++ kernel/loongarch64/smin_lsx.S | 174 +++++++++++++++++++ 6 files changed, 703 insertions(+) create mode 100644 kernel/loongarch64/dmin_lasx.S create mode 100644 kernel/loongarch64/dmin_lsx.S create mode 100644 kernel/loongarch64/smin_lasx.S create mode 100644 kernel/loongarch64/smin_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index e00893b72..0ff73c2db 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -16,4 +16,7 @@ DAMINKERNEL = damin_lsx.S SMAXKERNEL = smax_lsx.S DMAXKERNEL = dmax_lsx.S +SMINKERNEL = smin_lsx.S +DMINKERNEL = dmin_lsx.S + endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index f238436f5..71f53d9d7 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -16,6 +16,9 @@ DAMINKERNEL = damin_lasx.S SMAXKERNEL = smax_lasx.S DMAXKERNEL = dmax_lasx.S +SMINKERNEL = smin_lasx.S +DMINKERNEL = dmin_lasx.S + DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S DGEMMITCOPY = dgemm_tcopy_16.S diff --git a/kernel/loongarch64/dmin_lasx.S b/kernel/loongarch64/dmin_lasx.S new file mode 100644 index 000000000..e76056565 --- /dev/null +++ b/kernel/loongarch64/dmin_lasx.S @@ -0,0 +1,175 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define J $r13 +#define t1 $r14 +#define t2 $r18 +#define t3 $r15 +#define t4 $r17 +#define TEMP $r16 +#define m0 $xr8 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VX0 $xr20 +#define VX1 $xr21 +#define VM0 $xr22 +#define VM1 $xr23 +#define VM2 $xr19 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + xvld VM0, X, 0 + srai.d I, N, 3 + bge $r0, I, .L12 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + addi.d I, I, -1 + xvfmin.d VM1, VX1, VX0 + addi.d X, X, 8 * SIZE + xvfmin.d VM0, VM0, VM1 + blt $r0, I, .L10 + .align 3 + +.L11: + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmin.d VM1, x1, x2 + xvfmin.d VM2, x3, x4 + xvfmin.d VM0, VM1, VM2 + .align 3 + +.L12: //INCX==1 and N<8 + andi I, N, 7 + li.d J, 4 + bge J, I, .L13 // 4 Date: Thu, 7 Dec 2023 11:56:41 +0800 Subject: [PATCH 06/15] loongarch64: Add optimizations for imax. 
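
The i*max kernels return an index rather than a value: the 1-based position of the largest element, keeping the smallest index on ties (the tail blocks such as .L26-.L28, commented "处理尾数相等时取最小序号", i.e. "take the smallest index when values compare equal", perform that tie-break across vector lanes), and 0 when n <= 0 or incx <= 0, since i0 is zeroed in the PROLOGUE and returned on the early exit. A plain-C sketch of that contract, with an illustrative helper name and long/double in place of BLASLONG/FLOAT:

    /* Assumed contract for i(d)max: 1-based index of the maximum, first
     * occurrence wins on ties, 0 for n <= 0 or incx <= 0. */
    static long imax_ref(long n, const double *x, long incx)
    {
        if (n <= 0 || incx <= 0)
            return 0;
        long idx = 1;
        double m = x[0];
        for (long i = 1; i < n; i++) {
            double v = x[i * incx];
            if (v > m) {      /* strict '>' keeps the earliest index on ties */
                m = v;
                idx = i + 1;
            }
        }
        return idx;
    }
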
--- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 3 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 3 + kernel/loongarch64/idmax_lasx.S | 273 +++++++++++++++++ kernel/loongarch64/idmax_lsx.S | 225 ++++++++++++++ kernel/loongarch64/ismax_lasx.S | 375 +++++++++++++++++++++++ kernel/loongarch64/ismax_lsx.S | 272 ++++++++++++++++ 6 files changed, 1151 insertions(+) create mode 100644 kernel/loongarch64/idmax_lasx.S create mode 100644 kernel/loongarch64/idmax_lsx.S create mode 100644 kernel/loongarch64/ismax_lasx.S create mode 100644 kernel/loongarch64/ismax_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 0ff73c2db..3c201ad89 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -19,4 +19,7 @@ DMAXKERNEL = dmax_lsx.S SMINKERNEL = smin_lsx.S DMINKERNEL = dmin_lsx.S +ISMAXKERNEL = ismax_lsx.S +IDMAXKERNEL = idmax_lsx.S + endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 71f53d9d7..953793b9a 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -19,6 +19,9 @@ DMAXKERNEL = dmax_lasx.S SMINKERNEL = smin_lasx.S DMINKERNEL = dmin_lasx.S +ISMAXKERNEL = ismax_lasx.S +IDMAXKERNEL = idmax_lasx.S + DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S DGEMMITCOPY = dgemm_tcopy_16.S diff --git a/kernel/loongarch64/idmax_lasx.S b/kernel/loongarch64/idmax_lasx.S new file mode 100644 index 000000000..bbfe0941a --- /dev/null +++ b/kernel/loongarch64/idmax_lasx.S @@ -0,0 +1,273 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VX0 $xr13 +#define VX1 $xr14 +#define VM0 $xr15 +#define VM1 $xr16 +#define VINC4 $xr17 +#define VINC8 $xr18 +#define VI0 $xr20 +#define VI1 $xr21 +#define VI2 $xr22 +#define VI3 $xr8 +#define VI4 $xr19 +#define VT0 $xr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + xvld VM0, X, 0 + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvadd.d VI1, VI1, VINC8 + xvld VX1, X, 4 * SIZE + xvadd.d VI2, VI1, VINC4 + xvfcmp.clt.d VT0, VX0, VX1 + addi.d I, I, -1 + xvbitsel.v VM1, VX0, VX1, VT0 + xvbitsel.v VI2, VI1, VI2, VT0 + xvfcmp.clt.d VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VI0, VI2, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfcmp.clt.d VT0, x1, x2 + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, 
VI2, VT0 + xvfcmp.clt.d VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.d VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.d $f17, TEMP + ffint.d.l $f17, $f17 + xvfcmp.ceq.d VT0, VM0, x1 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + xvfcmp.clt.d VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + + +.L20: // INCX!=1 + move TEMP, X + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t2, 1 + xvinsgr2vr.d VM0, t3, 2 + xvinsgr2vr.d VM0, t4, 3 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 + .align 3 + +.L24: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + xvadd.d VI1, VI1, VINC8 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvadd.d VI2, VI1, VINC4 + xvfcmp.clt.d VT0, VX0, VX1 + addi.d I, I, -1 + xvbitsel.v VM1, VX0, VX1, VT0 + xvbitsel.v VI2, VI1, VI2, VT0 + xvfcmp.clt.d VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VI0, VI2, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfcmp.clt.d VT0, x1, x2 + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.d VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.d VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.d $f17, TEMP + ffint.d.l $f17, $f17 + xvfcmp.ceq.d VT0, VM0, x1 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + xvfcmp.clt.d VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L26: + xvfcmp.ceq.d VT0, VM0, x2 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L27 + xvfcmp.clt.d VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + xvfcmp.ceq.d VT0, VM0, x3 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L28 + xvfcmp.clt.d VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + xvfcmp.ceq.d VT0, VM0, x4 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L29 + xvfcmp.clt.d VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: + movfr2gr.d i0, $f20 + .align 3 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 
//current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.d $f9, X, 0 + addi.d I, I, -1 + fcmp.clt.d $fcc0, $f15, $f9 + add.d X, X, INCX + fsel $f15, $f15, $f9, $fcc0 + fsel $f20, $f20, $f21, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + movfr2gr.d i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/idmax_lsx.S b/kernel/loongarch64/idmax_lsx.S new file mode 100644 index 000000000..1b4734bab --- /dev/null +++ b/kernel/loongarch64/idmax_lsx.S @@ -0,0 +1,225 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VINC2 $vr17 +#define VINC4 $vr18 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + vld VM0, X, 0 + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC2, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 2 * SIZE + vadd.d VI2, VI1, VINC2 + vfcmp.clt.d VT0, VX0, VX1 + vbitsel.v x1, VX0, VX1, VT0 + vbitsel.v x2, VI1, VI2, VT0 + vld VX0, X, 4 * SIZE + vadd.d VI1, VI2, VINC2 + vld VX1, X, 6 * SIZE + vadd.d VI2, VI1, VINC2 + vfcmp.clt.d VT0, VX0, VX1 + addi.d I, I, -1 + vbitsel.v x3, VX0, VX1, VT0 + vbitsel.v x4, VI1, VI2, VT0 + vfcmp.clt.d VT0, x1, x3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT0 + vfcmp.clt.d VT0, VM0, x1 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, VM0, x1, VT0 + vbitsel.v VI0, VI0, x2, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.d $f17, TEMP + ffint.d.l $f17, $f17 + vfcmp.ceq.d VT0, x2, x1 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L27 + .align 3 + +.L20: // INCX!=1 + move TEMP, X + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t2, 1 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC2, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 + .align 3 + +.L24: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI1, VINC4 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d 
VX1, t2, 1 + vadd.d VI2, VI1, VINC2 + vfcmp.clt.d VT0, VX0, VX1 + vbitsel.v x1, VX0, VX1, VT0 + vbitsel.v x2, VI1, VI2, VT0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI2, VINC2 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, VINC2 + vfcmp.clt.d VT0, VX0, VX1 + vbitsel.v x3, VX0, VX1, VT0 + vbitsel.v x4, VI1, VI2, VT0 + vfcmp.clt.d VT0, x1, x3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT0 + vfcmp.clt.d VT0, VM0, x1 + addi.d I, I, -1 + vbitsel.v VM0, VM0, x1, VT0 + vbitsel.v VI0, VI0, x2, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.d $f17, TEMP + ffint.d.l $f17, $f17 + vfcmp.ceq.d VT0, x2, x1 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L27 + .align 3 + +.L26: + vfcmp.clt.d VT0, x1, x2 + vbitsel.v VM0, x1, x2, VT0 + vbitsel.v VI0, VI1, VI2, VT0 + .align 3 + +.L27: + movfr2gr.d i0, $f20 + .align 3 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.d $f9, X, 0 + addi.d I, I, -1 + fcmp.clt.d $fcc0, $f15, $f9 + add.d X, X, INCX + fsel $f15, $f15, $f9, $fcc0 + fsel $f20, $f20, $f21, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + movfr2gr.d i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/ismax_lasx.S b/kernel/loongarch64/ismax_lasx.S new file mode 100644 index 000000000..843dd6c6a --- /dev/null +++ b/kernel/loongarch64/ismax_lasx.S @@ -0,0 +1,375 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VX0 $xr13 +#define VX1 $xr14 +#define VM0 $xr15 +#define VM1 $xr16 +#define VINC4 $xr17 +#define VINC8 $xr18 +#define VI0 $xr20 +#define VI1 $xr21 +#define VI2 $xr22 +#define VI3 $xr8 +#define VI4 $xr19 +#define VT0 $xr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + xvld VM0, X, 0 + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 4 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + 
xvinsgr2vr.w VI0, i0, 7 //8 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvadd.w VI1, VI1, VINC8 + xvfcmp.clt.s VT0, VM0, VX0 + addi.d I, I, -1 + xvbitsel.v VM0, VM0, VX0, VT0 + xvbitsel.v VI0, VI0, VI1, VT0 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + .align 3 + +.L15: + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.w $f17, TEMP + ffint.s.w $f17, $f17 + xvfcmp.ceq.s VT0, VM0, x1 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + xvfcmp.clt.s VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + + +.L20: // INCX!=1 + move TEMP, X + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.w VM0, t1, 0 + xvinsgr2vr.w VM0, t2, 1 + xvinsgr2vr.w VM0, t3, 2 + xvinsgr2vr.w VM0, t4, 3 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.w VM0, t1, 4 + xvinsgr2vr.w VM0, t2, 5 + xvinsgr2vr.w VM0, t3, 6 + xvinsgr2vr.w VM0, t4, 7 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 4 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 + .align 3 + +.L24: + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvadd.w VI1, VI1, VINC8 + xvfcmp.clt.s VT0, VM0, VX0 + addi.d I, I, -1 + xvbitsel.v VM0, VM0, VX0, VT0 + xvbitsel.v VI0, VI0, VI1, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + 
xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.w $f17, TEMP + ffint.s.w $f17, $f17 + xvfcmp.ceq.s VT0, VM0, x1 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + xvfcmp.clt.s VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L26: + xvfcmp.ceq.s VT0, VM0, x2 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L27 + xvfcmp.clt.s VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + xvfcmp.ceq.s VT0, VM0, x3 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L28 + xvfcmp.clt.s VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + xvfcmp.ceq.s VT0, VM0, x4 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L29 + xvfcmp.clt.s VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: + fmov.s $f16, $f20 + .align 3 + +.L252: + xvxor.v VI0, VI0, VI0 + xvor.v VI0, VI0, VX0 + fmov.s $f13, $f15 + xvxor.v VM0, VM0, VM0 + xvor.v VM0, VM0, VX1 + xvpickve.w VI1, VI0, 4 + xvpickve.w VI2, VI0, 5 + xvpickve.w VI3, VI0, 6 + xvpickve.w VI4, VI0, 7 + xvpickve.w x1, VM0, 4 + xvpickve.w x2, VM0, 5 + xvpickve.w x3, VM0, 6 + xvpickve.w x4, VM0, 7 + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v x1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM0, x1 + xvbitsel.v VM0, VM0, x1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.w $f17, TEMP + ffint.s.w $f17, $f17 + xvfcmp.ceq.s VT0, VM0, x1 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L262 + xvfcmp.clt.s VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L262: + xvfcmp.ceq.s VT0, VM0, x2 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L272 + xvfcmp.clt.s VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L272: + xvfcmp.ceq.s VT0, VM0, x3 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L282 + xvfcmp.clt.s VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L282: + xvfcmp.ceq.s VT0, VM0, x4 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L292 + xvfcmp.clt.s VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L292: + fcmp.clt.s $fcc0, $f15, $f13 + fsel $f15, $f15, $f13, $fcc0 + fsel $f20, $f20, $f16, $fcc0 + movfr2gr.s i0, $f20 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.d $f9, X, 0 + addi.d I, I, -1 + fcmp.clt.s $fcc0, $f15, $f9 + add.d X, X, INCX + fsel $f15, $f15, $f9, $fcc0 + fsel $f20, $f20, $f21, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + movfr2gr.s i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/ismax_lsx.S b/kernel/loongarch64/ismax_lsx.S new file mode 100644 index 000000000..33b326bbd --- /dev/null +++ b/kernel/loongarch64/ismax_lsx.S @@ -0,0 +1,272 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $vr9 
+#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VINC4 $vr17 +#define VINC8 $vr18 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + vld VM0, X, 0 + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vadd.w VI1, VI1, VINC8 + vld VX1, X, 4 * SIZE + vadd.w VI2, VI1, VINC4 + vfcmp.clt.s VT0, VX0, VX1 + addi.d I, I, -1 + vbitsel.v VM1, VX0, VX1, VT0 + vbitsel.v VI2, VI1, VI2, VT0 + vfcmp.clt.s VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, VI0, VI2, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfcmp.clt.s VT0, x1, x2 + vbitsel.v VM1, x1, x2, VT0 + vbitsel.v VINC4, VI1, VI2, VT0 + vfcmp.clt.s VT0, x3, x4 + vbitsel.v VM0, x3, x4, VT0 + vbitsel.v VINC8, VI3, VI4, VT0 + vfcmp.clt.s VT0, VM0, VM1 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.w $f17, TEMP + ffint.s.w $f17, $f17 + vfcmp.ceq.s VT0, VM0, x1 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + +.L20: // INCX!=1 + move TEMP, X + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t2, 1 + vinsgr2vr.w VM0, t3, 2 + vinsgr2vr.w VM0, t4, 3 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 + .align 3 + +.L24: + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vadd.w VI1, VI1, VINC8 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 
+ vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vadd.w VI2, VI1, VINC4 + vfcmp.clt.s VT0, VX0, VX1 + addi.d I, I, -1 + vbitsel.v VM1, VX0, VX1, VT0 + vbitsel.v VI2, VI1, VI2, VT0 + vfcmp.clt.s VT0, VM0, VM1 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, VI0, VI2, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfcmp.clt.s VT0, x1, x2 + vbitsel.v VM1, x1, x2, VT0 + vbitsel.v VINC4, VI1, VI2, VT0 + vfcmp.clt.s VT0, x3, x4 + vbitsel.v VM0, x3, x4, VT0 + vbitsel.v VINC8, VI3, VI4, VT0 + vfcmp.clt.s VT0, VM0, VM1 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.w $f17, TEMP + ffint.s.w $f17, $f17 + vfcmp.ceq.s VT0, VM0, x1 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L26: + vfcmp.ceq.s VT0, VM0, x2 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L27 + vfcmp.clt.s VT0, VI2, VI0 + vbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + vfcmp.ceq.s VT0, VM0, x3 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L28 + vfcmp.clt.s VT0, VI3, VI0 + vbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + vfcmp.ceq.s VT0, VM0, x4 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L29 + vfcmp.clt.s VT0, VI4, VI0 + vbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: + movfr2gr.s i0, $f20 + .align 3 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.d $f9, X, 0 + addi.d I, I, -1 + fcmp.clt.s $fcc0, $f15, $f9 + fsel $f15, $f15, $f9, $fcc0 + fsel $f20, $f20, $f21, $fcc0 + addi.d i1, i1, 1 + add.d X, X, INCX + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + movfr2gr.s i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE \ No newline at end of file From e3fb2b5afa63fc0db472268adfd7038146ea6ac6 Mon Sep 17 00:00:00 2001 From: yancheng Date: Thu, 7 Dec 2023 12:01:05 +0800 Subject: [PATCH 07/15] loongarch64: Add optimizations for imin. 
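
Same contract as the imax patch with the comparison reversed: i*min returns the 1-based index of the smallest element, smallest index on ties, and 0 for n <= 0 or incx <= 0 (i0 is zeroed in the PROLOGUE and returned on the early exit). Hedged scalar sketch, illustrative names and types as before:

    /* Assumed contract for i(d)min: 1-based index of the minimum, first
     * occurrence wins on ties, 0 for n <= 0 or incx <= 0. */
    static long imin_ref(long n, const double *x, long incx)
    {
        if (n <= 0 || incx <= 0)
            return 0;
        long idx = 1;
        double m = x[0];
        for (long i = 1; i < n; i++) {
            double v = x[i * incx];
            if (v < m) {      /* strict '<' keeps the earliest index on ties */
                m = v;
                idx = i + 1;
            }
        }
        return idx;
    }
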
--- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 3 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 3 + kernel/loongarch64/idmin_lasx.S | 272 +++++++++++++++++ kernel/loongarch64/idmin_lsx.S | 225 ++++++++++++++ kernel/loongarch64/ismin_lasx.S | 374 +++++++++++++++++++++++ kernel/loongarch64/ismin_lsx.S | 271 ++++++++++++++++ 6 files changed, 1148 insertions(+) create mode 100644 kernel/loongarch64/idmin_lasx.S create mode 100644 kernel/loongarch64/idmin_lsx.S create mode 100644 kernel/loongarch64/ismin_lasx.S create mode 100644 kernel/loongarch64/ismin_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 3c201ad89..d0eab3d7c 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -22,4 +22,7 @@ DMINKERNEL = dmin_lsx.S ISMAXKERNEL = ismax_lsx.S IDMAXKERNEL = idmax_lsx.S +ISMINKERNEL = ismin_lsx.S +IDMINKERNEL = idmin_lsx.S + endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 953793b9a..92389001b 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -22,6 +22,9 @@ DMINKERNEL = dmin_lasx.S ISMAXKERNEL = ismax_lasx.S IDMAXKERNEL = idmax_lasx.S +ISMINKERNEL = ismin_lasx.S +IDMINKERNEL = idmin_lasx.S + DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S DGEMMITCOPY = dgemm_tcopy_16.S diff --git a/kernel/loongarch64/idmin_lasx.S b/kernel/loongarch64/idmin_lasx.S new file mode 100644 index 000000000..7930d4963 --- /dev/null +++ b/kernel/loongarch64/idmin_lasx.S @@ -0,0 +1,272 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VX0 $xr13 +#define VX1 $xr14 +#define VM0 $xr15 +#define VM1 $xr16 +#define VINC4 $xr17 +#define VINC8 $xr18 +#define VI0 $xr20 +#define VI1 $xr21 +#define VI2 $xr22 +#define VI3 $xr8 +#define VI4 $xr19 +#define VT0 $xr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + xvld VM0, X, 0 + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvadd.d VI1, VI1, VINC8 + xvld VX1, X, 4 * SIZE + xvadd.d VI2, VI1, VINC4 + xvfcmp.clt.d VT0, VX1, VX0 + addi.d I, I, -1 + xvbitsel.v VM1, VX0, VX1, VT0 + xvbitsel.v VI2, VI1, VI2, VT0 + xvfcmp.clt.d VT0, VM1, VM0 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VI0, VI2, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfcmp.clt.d VT0, x2, x1 + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v 
VINC4, VI1, VI2, VT0 + xvfcmp.clt.d VT0, x4, x3 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.d VT0, VM1, VM0 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.d $f17, TEMP + ffint.d.l $f17, $f17 + xvfcmp.ceq.d VT0, VM0, x1 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + xvfcmp.clt.d VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + +.L20: // INCX!=1 + move TEMP, X + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t2, 1 + xvinsgr2vr.d VM0, t3, 2 + xvinsgr2vr.d VM0, t4, 3 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 + .align 3 + +.L24: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + xvadd.d VI1, VI1, VINC8 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvadd.d VI2, VI1, VINC4 + xvfcmp.clt.d VT0, VX1, VX0 + addi.d I, I, -1 + xvbitsel.v VM1, VX0, VX1, VT0 + xvbitsel.v VI2, VI1, VI2, VT0 + xvfcmp.clt.d VT0, VM1, VM0 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VI0, VI2, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfcmp.clt.d VT0, x2, x1 + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.d VT0, x4, x3 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.d VT0, VM1, VM0 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.d $f17, TEMP + ffint.d.l $f17, $f17 + xvfcmp.ceq.d VT0, VM0, x1 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + xvfcmp.clt.d VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L26: + xvfcmp.ceq.d VT0, VM0, x2 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L27 + xvfcmp.clt.d VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + xvfcmp.ceq.d VT0, VM0, x3 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L28 + xvfcmp.clt.d VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + xvfcmp.ceq.d VT0, VM0, x4 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L29 + xvfcmp.clt.d VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: + movfr2gr.d i0, $f20 + .align 3 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 
1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.d $f9, X, 0 + addi.d I, I, -1 + fcmp.clt.d $fcc0, $f9, $f15 + add.d X, X, INCX + fsel $f15, $f15, $f9, $fcc0 + fsel $f20, $f20, $f21, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + movfr2gr.d i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/idmin_lsx.S b/kernel/loongarch64/idmin_lsx.S new file mode 100644 index 000000000..8b6edcbf0 --- /dev/null +++ b/kernel/loongarch64/idmin_lsx.S @@ -0,0 +1,225 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VINC2 $vr17 +#define VINC4 $vr18 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + vld VM0, X, 0 + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC2, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 2 * SIZE + vadd.d VI2, VI1, VINC2 + vfcmp.clt.d VT0, VX1, VX0 + vbitsel.v x1, VX0, VX1, VT0 + vbitsel.v x2, VI1, VI2, VT0 + vld VX0, X, 4 * SIZE + vadd.d VI1, VI2, VINC2 + vld VX1, X, 6 * SIZE + vadd.d VI2, VI1, VINC2 + vfcmp.clt.d VT0, VX1, VX0 + addi.d I, I, -1 + vbitsel.v x3, VX0, VX1, VT0 + vbitsel.v x4, VI1, VI2, VT0 + vfcmp.clt.d VT0, x3, x1 + addi.d X, X, 8 * SIZE + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT0 + vfcmp.clt.d VT0, x1, VM0 + vbitsel.v VM0, VM0, x1, VT0 + vbitsel.v VI0, VI0, x2, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.d $f17, TEMP + ffint.d.l $f17, $f17 + vfcmp.ceq.d VT0, x2, x1 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L27 + .align 3 + +.L20: // INCX!=1 + move TEMP, X + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t2, 1 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC2, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 + .align 3 + +.L24: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI1, VINC4 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + 
vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, VINC2 + vfcmp.clt.d VT0, VX1, VX0 + vbitsel.v x1, VX0, VX1, VT0 + vbitsel.v x2, VI1, VI2, VT0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI2, VINC2 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, VINC2 + vfcmp.clt.d VT0, VX1, VX0 + vbitsel.v x3, VX0, VX1, VT0 + vbitsel.v x4, VI1, VI2, VT0 + vfcmp.clt.d VT0, x3, x1 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT0 + vfcmp.clt.d VT0, x1, VM0 + addi.d I, I, -1 + vbitsel.v VM0, VM0, x1, VT0 + vbitsel.v VI0, VI0, x2, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.d $f17, TEMP + ffint.d.l $f17, $f17 + vfcmp.ceq.d VT0, x2, x1 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L27 + .align 3 + +.L26: + vfcmp.clt.d VT0, x2, x1 + vbitsel.v VM0, x1, x2, VT0 + vbitsel.v VI0, VI1, VI2, VT0 + .align 3 + +.L27: + movfr2gr.d i0, $f20 + .align 3 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.d $f9, X, 0 + addi.d I, I, -1 + fcmp.clt.d $fcc0, $f9, $f15 + add.d X, X, INCX + fsel $f15, $f15, $f9, $fcc0 + fsel $f20, $f20, $f21, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + movfr2gr.d i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/ismin_lasx.S b/kernel/loongarch64/ismin_lasx.S new file mode 100644 index 000000000..15f6e2ec9 --- /dev/null +++ b/kernel/loongarch64/ismin_lasx.S @@ -0,0 +1,374 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VX0 $xr13 +#define VX1 $xr14 +#define VM0 $xr15 +#define VM1 $xr16 +#define VINC4 $xr17 +#define VINC8 $xr18 +#define VI0 $xr20 +#define VI1 $xr21 +#define VI2 $xr22 +#define VI3 $xr8 +#define VI4 $xr19 +#define VT0 $xr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + xvld VM0, X, 0 + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 4 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w 
i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvadd.w VI1, VI1, VINC8 + xvfcmp.clt.s VT0, VX0, VM0 + addi.d I, I, -1 + xvbitsel.v VM0, VM0, VX0, VT0 + xvbitsel.v VI0, VI0, VI1, VT0 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + .align 3 + +.L15: + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfcmp.clt.s VT0, x2, x1 + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x4, x3 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM1, VM0 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.w $f17, TEMP + ffint.s.w $f17, $f17 + xvfcmp.ceq.s VT0, x1, VM0 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + xvfcmp.clt.s VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + +.L20: // INCX!=1 + move TEMP, X + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.w VM0, t1, 0 + xvinsgr2vr.w VM0, t2, 1 + xvinsgr2vr.w VM0, t3, 2 + xvinsgr2vr.w VM0, t4, 3 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.w VM0, t1, 4 + xvinsgr2vr.w VM0, t2, 5 + xvinsgr2vr.w VM0, t3, 6 + xvinsgr2vr.w VM0, t4, 7 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 4 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 + .align 3 + +.L24: + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvadd.w VI1, VI1, VINC8 + xvfcmp.clt.s VT0, VX0, VM0 + addi.d I, I, -1 + xvbitsel.v VM0, VM0, VX0, VT0 + xvbitsel.v VI0, VI0, VI1, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 
+ xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfcmp.clt.s VT0, x2, x1 + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x4, x3 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM1, VM0 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.w $f17, TEMP + ffint.s.w $f17, $f17 + xvfcmp.ceq.s VT0, VM0, x1 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + xvfcmp.clt.s VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L26: + xvfcmp.ceq.s VT0, VM0, x2 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L27 + xvfcmp.clt.s VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + xvfcmp.ceq.s VT0, VM0, x3 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L28 + xvfcmp.clt.s VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + xvfcmp.ceq.s VT0, VM0, x4 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L29 + xvfcmp.clt.s VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: + fmov.s $f16, $f20 + .align 3 + +.L252: + xvxor.v VI0, VI0, VI0 + xvor.v VI0, VI0, VX0 + fmov.s $f13, $f15 + xvxor.v VM0, VM0, VM0 + xvor.v VM0, VM0, VX1 + xvpickve.w VI1, VI0, 4 + xvpickve.w VI2, VI0, 5 + xvpickve.w VI3, VI0, 6 + xvpickve.w VI4, VI0, 7 + xvpickve.w x1, VM0, 4 + xvpickve.w x2, VM0, 5 + xvpickve.w x3, VM0, 6 + xvpickve.w x4, VM0, 7 + xvfcmp.clt.s VT0, x2, x1 + xvbitsel.v x1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x4, x3 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, x1, VM0 + xvbitsel.v VM0, VM0, x1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.w $f17, TEMP + ffint.s.w $f17, $f17 + xvfcmp.ceq.s VT0, VM0, x1 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L262 + xvfcmp.clt.s VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L262: + xvfcmp.ceq.s VT0, VM0, x2 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L272 + xvfcmp.clt.s VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L272: + xvfcmp.ceq.s VT0, VM0, x3 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L282 + xvfcmp.clt.s VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L282: + xvfcmp.ceq.s VT0, VM0, x4 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L292 + xvfcmp.clt.s VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L292: + fcmp.clt.s $fcc0, $f13, $f15 + fsel $f15, $f15, $f13, $fcc0 + fsel $f20, $f20, $f16, $fcc0 + movfr2gr.s i0, $f20 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.d $f9, X, 0 + addi.d I, I, -1 + fcmp.clt.s $fcc0, $f9, $f15 + fsel $f15, $f15, $f9, $fcc0 + fsel $f20, $f20, $f21, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + add.d X, X, INCX + blt $r0, I, .L22 + movfr2gr.s i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/ismin_lsx.S b/kernel/loongarch64/ismin_lsx.S new file mode 100644 index 000000000..f90ebbd57 --- /dev/null +++ b/kernel/loongarch64/ismin_lsx.S @@ -0,0 +1,271 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $vr9 
+#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VINC4 $vr17 +#define VINC8 $vr18 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + vld VM0, X, 0 + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vadd.w VI1, VI1, VINC8 + vld VX1, X, 4 * SIZE + vadd.w VI2, VI1, VINC4 + vfcmp.clt.s VT0, VX1, VX0 + addi.d I, I, -1 + vbitsel.v VM1, VX0, VX1, VT0 + vbitsel.v VI2, VI1, VI2, VT0 + vfcmp.clt.s VT0, VM1, VM0 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, VI0, VI2, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfcmp.clt.s VT0, x2, x1 + vbitsel.v VM1, x1, x2, VT0 + vbitsel.v VINC4, VI1, VI2, VT0 + vfcmp.clt.s VT0, x4, x3 + vbitsel.v VM0, x3, x4, VT0 + vbitsel.v VINC8, VI3, VI4, VT0 + vfcmp.clt.s VT0, VM1, VM0 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.w $f17, TEMP + ffint.s.w $f17, $f17 + vfcmp.ceq.s VT0, x1, VM0 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + +.L20: // INCX!=1 + move TEMP, X + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t2, 1 + vinsgr2vr.w VM0, t3, 2 + vinsgr2vr.w VM0, t4, 3 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 + .align 3 + +.L24: + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vadd.w VI1, VI1, VINC8 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 
+ vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vadd.w VI2, VI1, VINC4 + vfcmp.clt.s VT0, VX1, VX0 + addi.d I, I, -1 + vbitsel.v VM1, VX0, VX1, VT0 + vbitsel.v VI2, VI1, VI2, VT0 + vfcmp.clt.s VT0, VM1, VM0 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, VI0, VI2, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfcmp.clt.s VT0, x2, x1 + vbitsel.v VM1, x1, x2, VT0 + vbitsel.v VINC4, VI1, VI2, VT0 + vfcmp.clt.s VT0, x4, x3 + vbitsel.v VM0, x3, x4, VT0 + vbitsel.v VINC8, VI3, VI4, VT0 + vfcmp.clt.s VT0, VM1, VM0 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.w $f17, TEMP + ffint.s.w $f17, $f17 + vfcmp.ceq.s VT0, x1, VM0 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L26: + vfcmp.ceq.s VT0, x2, VM0 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L27 + vfcmp.clt.s VT0, VI2, VI0 + vbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + vfcmp.ceq.s VT0, x3, VM0 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L28 + vfcmp.clt.s VT0, VI3, VI0 + vbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + vfcmp.ceq.s VT0, x4, VM0 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L29 + vfcmp.clt.s VT0, VI4, VI0 + vbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: + movfr2gr.s i0, $f20 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.d $f9, X, 0 + fcmp.clt.s $fcc0, $f9, $f15 + fsel $f15, $f15, $f9, $fcc0 + fsel $f20, $f20, $f21, $fcc0 + addi.d I, I, -1 + addi.d i1, i1, 1 + add.d X, X, INCX + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + movfr2gr.s i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE \ No newline at end of file From be83f5e4e08bfc5a7d94d5ed659368ca077c4374 Mon Sep 17 00:00:00 2001 From: yancheng Date: Thu, 7 Dec 2023 12:07:30 +0800 Subject: [PATCH 08/15] loongarch64: Add optimizations for iamax. 
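
The ISAMAX/IDAMAX kernels compare magnitudes with vfmaxa/xvfmaxa, so the reduction tracks the element with the largest absolute value and, when magnitudes are equal, keeps the smaller index. A scalar C sketch of that semantics for the double-precision case is shown below; it is illustrative only, and the helper name is a placeholder rather than the OpenBLAS entry point:

    #include <math.h>

    /* 1-based index of the element with the largest |x[i]|; first occurrence wins. */
    static long ref_iamax(long n, const double *x, long incx)
    {
        if (n <= 0 || incx <= 0) return 0;
        long best = 1;
        double max = fabs(x[0]);
        for (long i = 1; i < n; i++) {
            double v = fabs(x[i * incx]);
            if (v > max) {                   /* strict '>' keeps the earliest index */
                max = v;
                best = i + 1;
            }
        }
        return best;
    }
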
--- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 3 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 3 + kernel/loongarch64/idamax_lasx.S | 275 +++++++++++++++++ kernel/loongarch64/idamax_lsx.S | 267 ++++++++++++++++ kernel/loongarch64/isamax_lasx.S | 378 +++++++++++++++++++++++ kernel/loongarch64/isamax_lsx.S | 275 +++++++++++++++++ 6 files changed, 1201 insertions(+) create mode 100644 kernel/loongarch64/idamax_lasx.S create mode 100644 kernel/loongarch64/idamax_lsx.S create mode 100644 kernel/loongarch64/isamax_lasx.S create mode 100644 kernel/loongarch64/isamax_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index d0eab3d7c..a4c8927ba 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -25,4 +25,7 @@ IDMAXKERNEL = idmax_lsx.S ISMINKERNEL = ismin_lsx.S IDMINKERNEL = idmin_lsx.S +ISAMAXKERNEL = isamax_lsx.S +IDAMAXKERNEL = idamax_lsx.S + endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 92389001b..835b027b8 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -25,6 +25,9 @@ IDMAXKERNEL = idmax_lasx.S ISMINKERNEL = ismin_lasx.S IDMINKERNEL = idmin_lasx.S +ISAMAXKERNEL = isamax_lasx.S +IDAMAXKERNEL = idamax_lasx.S + DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S DGEMMITCOPY = dgemm_tcopy_16.S diff --git a/kernel/loongarch64/idamax_lasx.S b/kernel/loongarch64/idamax_lasx.S new file mode 100644 index 000000000..8248ee757 --- /dev/null +++ b/kernel/loongarch64/idamax_lasx.S @@ -0,0 +1,275 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VX0 $xr13 +#define VX1 $xr14 +#define VM0 $xr15 +#define VM1 $xr16 +#define VINC4 $xr17 +#define VINC8 $xr18 +#define VI0 $xr20 +#define VI1 $xr21 +#define VI2 $xr22 +#define VI3 $xr8 +#define VI4 $xr19 +#define VT0 $xr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + xvld VM0, X, 0 + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvadd.d VI1, VI1, VINC8 + xvld VX1, X, 4 * SIZE + xvadd.d VI2, VI1, VINC4 + xvfmaxa.d VM1, VX0, VX1 + xvfcmp.ceq.d VT0, VX0, VM1 + addi.d I, I, -1 + xvbitsel.v VI2, VI2, VI1, VT0 + xvfmaxa.d VM1, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI2, VI0, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmaxa.d VM1, x1, x2 + 
xvfcmp.ceq.d VT0, x1, VM1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmaxa.d VM0, x4, x3 + xvfcmp.ceq.d VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmaxa.d VM0, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.d $f17, TEMP + ffint.d.l $f17, $f17 + xvfcmp.ceq.d VT0, VM0, x1 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + xvfcmp.clt.d VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + +.L20: // INCX!=1 + move TEMP, X + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t2, 1 + xvinsgr2vr.d VM0, t3, 2 + xvinsgr2vr.d VM0, t4, 3 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 + .align 3 + +.L24: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + xvadd.d VI1, VI1, VINC8 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvadd.d VI2, VI1, VINC4 + xvfmaxa.d VM1, VX0, VX1 + xvfcmp.ceq.d VT0, VX0, VM1 + addi.d I, I, -1 + xvbitsel.v VI2, VI2, VI1, VT0 + xvfmaxa.d VM1, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI2, VI0, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmaxa.d VM1, x1, x2 + xvfcmp.ceq.d VT0, x1, VM1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmaxa.d VM0, x4, x3 + xvfcmp.ceq.d VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmaxa.d VM0, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.d $f17, TEMP + ffint.d.l $f17, $f17 + xvfcmp.ceq.d VT0, VM0, x1 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + xvfcmp.clt.d VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L26: + xvfcmp.ceq.d VT0, VM0, x2 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L27 + xvfcmp.clt.d VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + xvfcmp.ceq.d VT0, VM0, x3 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L28 + xvfcmp.clt.d VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + xvfcmp.ceq.d VT0, VM0, x4 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L29 + xvfcmp.clt.d VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: + movfr2gr.d i0, $f20 + .align 3 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + 
slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.d $f9, X, 0 + addi.d I, I, -1 + xvfmaxa.d VM1, x1, VM0 + xvfcmp.ceq.d VT0, VM0, VM1 + add.d X, X, INCX + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + movfr2gr.d i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/idamax_lsx.S b/kernel/loongarch64/idamax_lsx.S new file mode 100644 index 000000000..fb2d5bac1 --- /dev/null +++ b/kernel/loongarch64/idamax_lsx.S @@ -0,0 +1,267 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VINC2 $vr17 +#define VINC4 $vr18 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + vld VM0, X, 0 + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L11 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC2, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 2 * SIZE + vadd.d VI2, VI1, VINC2 + vfmaxa.d x1, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x1 + vbitsel.v x2, VI2, VI1, VT0 + vld VX0, X, 4 * SIZE + vadd.d VI1, VI2, VINC2 + vld VX1, X, 6 * SIZE + vadd.d VI2, VI1, VINC2 + vfmaxa.d x3, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x3 + vbitsel.v x4, VI2, VI1, VT0 + vfmaxa.d x3, x1, x3 + vfcmp.ceq.d VT0, x1, x3 + vbitsel.v x2, x4, x2, VT0 + vfmaxa.d VM1, VM0, x3 + vfcmp.ceq.d VT0, VM0, VM1 + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, x2, VI0, VT0 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + .align 3 + +.L15: + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.d $f17, TEMP + ffint.d.l $f17, $f17 + vfcmp.ceq.d VT0, x2, x1 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L16 + vfcmp.clt.d VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L17 + .align 3 + +.L16: + vfmaxa.d VM0, x1, x2 + vfcmp.ceq.d VT0, x1, VM0 + vbitsel.v VI0, VI2, VI1, VT0 + .align 3 + +.L17: + movfr2gr.d i0, $f20 + .align 3 + +.L11: //INCX==1 and N<8 + andi I, N, 7 + bge $r0, I, .L14 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L13: + fld.d $f9, X, 0 + vfmaxa.d VM1, x1, VM0 + vfcmp.ceq.d VT0, VM0, VM1 + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + addi.d I, I, -1 + addi.d i1, i1, 1 + addi.d X, X, SIZE + movgr2fr.d $f21, i1 + blt $r0, I, .L13 + movfr2gr.d i0, $f20 + .align 3 + +.L14: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + move TEMP, X + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 
+ ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t2, 1 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC2, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 + .align 3 + +.L24: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI1, VINC4 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, VINC2 + vfmaxa.d x1, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x1 + vbitsel.v x2, VI2, VI1, VT0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI2, VINC2 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, VINC2 + vfmaxa.d x3, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x3 + vbitsel.v x4, VI2, VI1, VT0 + vfmaxa.d x3, x1, x3 + vfcmp.ceq.d VT0, x1, x3 + vbitsel.v x2, x4, x2, VT0 + vfmaxa.d VM1, VM0, x3 + vbitsel.v VM0, VM1, VM0, VT0 + vfcmp.ceq.d VT0, VM0, VM1 + vbitsel.v VI0, x2, VI0, VT0 + addi.d I, I, -1 + blt $r0, I, .L24 + .align 3 + +.L25: + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.d $f17, TEMP + ffint.d.l $f17, $f17 + vfcmp.ceq.d VT0, x2, x1 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L27 + .align 3 + +.L26: + vfmaxa.d VM0, x1, x2 + vfcmp.ceq.d VT0, x1, VM0 + vbitsel.v VI0, VI2, VI1, VT0 + .align 3 + +.L27: + movfr2gr.d i0, $f20 + .align 3 + +.L21: // N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.d $f9, X, 0 + vfmaxa.d VM1, x1, VM0 + vfcmp.ceq.d VT0, VM0, VM1 + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + addi.d I, I, -1 + addi.d i1, i1, 1 + add.d X, X, INCX + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + movfr2gr.d i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/isamax_lasx.S b/kernel/loongarch64/isamax_lasx.S new file mode 100644 index 000000000..2800b1d43 --- /dev/null +++ b/kernel/loongarch64/isamax_lasx.S @@ -0,0 +1,378 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VX0 $xr13 +#define VX1 $xr14 +#define VM0 $xr15 +#define VM1 $xr16 +#define VINC4 $xr17 +#define VINC8 $xr18 +#define VI0 $xr20 +#define VI1 $xr21 +#define VI2 $xr22 +#define VI3 $xr8 +#define VI4 $xr19 +#define VT0 $xr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + xvld VM0, X, 0 + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the 
index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 4 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + addi.d I, I, -1 + xvadd.w VI1, VI1, VINC8 + xvfmaxa.s VM1, VX0, VM0 + xvfcmp.ceq.s VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfmaxa.s VM1, x1, x2 + xvfcmp.ceq.s VT0, x1, VM1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmaxa.s VM0, x3, x4 + xvfcmp.ceq.s VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmaxa.s VM0, VM0, VM1 + xvfcmp.ceq.s VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.w $f17, TEMP + ffint.s.w $f17, $f17 + xvfcmp.ceq.s VT0, VM0, x1 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + xvfcmp.clt.s VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + +.L20: // INCX!=1 + move TEMP, X + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.w VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.w VM0, t2, 1 + xvinsgr2vr.w VM0, t3, 2 + xvinsgr2vr.w VM0, t4, 3 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.w VM0, t1, 4 + xvinsgr2vr.w VM0, t2, 5 + xvinsgr2vr.w VM0, t3, 6 + xvinsgr2vr.w VM0, t4, 7 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 4 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 + .align 3 + +.L24: + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + 
xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvadd.w VI1, VI1, VINC8 + xvfmaxa.s VM1, VX0, VM0 + xvfcmp.ceq.s VT0, VM1, VM0 + addi.d I, I, -1 + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfmaxa.s VM1, x1, x2 + xvfcmp.ceq.s VT0, x1, VM1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmaxa.s VM0, x3, x4 + xvfcmp.ceq.s VT0, x3, VM0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfmaxa.s VM0, VM0, VM1 + xvfcmp.ceq.s VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.w $f17, TEMP + ffint.s.w $f17, $f17 + xvfcmp.ceq.s VT0, VM0, x1 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + xvfcmp.clt.s VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L26: + xvfcmp.ceq.s VT0, VM0, x2 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L27 + xvfcmp.clt.s VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + xvfcmp.ceq.s VT0, VM0, x3 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L28 + xvfcmp.clt.s VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + xvfcmp.ceq.s VT0, VM0, x4 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L29 + xvfcmp.clt.s VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: + fmov.s $f16, $f20 + .align 3 + +.L252: + xvxor.v VI0, VI0, VI0 + xvor.v VI0, VI0, VX0 + fmov.s $f13, $f15 + xvxor.v VM0, VM0, VM0 + xvor.v VM0, VM0, VX1 + xvpickve.w VI1, VI0, 4 + xvpickve.w VI2, VI0, 5 + xvpickve.w VI3, VI0, 6 + xvpickve.w VI4, VI0, 7 + xvpickve.w x1, VM0, 4 + xvpickve.w x2, VM0, 5 + xvpickve.w x3, VM0, 6 + xvpickve.w x4, VM0, 7 + xvfmaxa.s VM1, x1, x2 + xvfcmp.ceq.s VT0, x1, VM1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmaxa.s VM0, x3, x4 + xvfcmp.ceq.s VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmaxa.s VM0, VM0, VM1 + xvfcmp.ceq.s VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.w $f17, TEMP + ffint.s.w $f17, $f17 + xvfcmp.ceq.s VT0, VM0, x1 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L262 + xvfcmp.clt.s VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L262: + xvfcmp.ceq.s VT0, VM0, x2 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L272 + xvfcmp.clt.s VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L272: + xvfcmp.ceq.s VT0, VM0, x3 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L282 + xvfcmp.clt.s VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L282: + xvfcmp.ceq.s VT0, VM0, x4 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L292 + xvfcmp.clt.s VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L292: + xvfmaxa.s VM0, VX0, VM0 + xvfcmp.ceq.s VT0, VM0, VX0 + xvbitsel.v VI0, VI0, VI1, VT0 + movfr2gr.s i0, $f20 + +.L21: // N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.s $f9, X, 0 + addi.d I, I, -1 + 
xvfmaxa.s VM1, x1, VM0 + xvfcmp.ceq.s VT0, VM0, VM1 + add.d X, X, INCX + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + movfr2gr.s i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/isamax_lsx.S b/kernel/loongarch64/isamax_lsx.S new file mode 100644 index 000000000..a18aa7354 --- /dev/null +++ b/kernel/loongarch64/isamax_lsx.S @@ -0,0 +1,275 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VINC4 $vr17 +#define VINC8 $vr18 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + vld VM0, X, 0 + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vadd.w VI1, VI1, VINC8 + vld VX1, X, 4 * SIZE + vadd.w VI2, VI1, VINC4 + vfmaxa.s VM1, VX0, VX1 + vfcmp.ceq.s VT0, VX0, VM1 + addi.d I, I, -1 + vbitsel.v VI2, VI2, VI1, VT0 + vfmaxa.s VM1, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI2, VI0, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmaxa.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC4, VI2, VI1, VT0 + vfmaxa.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC8, VI4, VI3, VT0 + vfmaxa.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.w $f17, TEMP + ffint.s.w $f17, $f17 + vfcmp.ceq.s VT0, VM0, x1 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + +.L20: // INCX!=1 + move TEMP, X + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t2, 1 + vinsgr2vr.w VM0, t3, 2 + vinsgr2vr.w VM0, t4, 3 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w 
VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 + .align 3 + +.L24: + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vadd.w VI1, VI1, VINC8 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vadd.w VI2, VI1, VINC4 + vfmaxa.s VM1, VX0, VX1 + vfcmp.ceq.s VT0, VX0, VM1 + vbitsel.v VI2, VI2, VI1, VT0 + vfmaxa.s VM1, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + addi.d I, I, -1 + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI2, VI0, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmaxa.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC4, VI2, VI1, VT0 + vfmaxa.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC8, VI4, VI3, VT0 + vfmaxa.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.w $f17, TEMP + ffint.s.w $f17, $f17 + vfcmp.ceq.s VT0, VM0, x1 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L26: + vfcmp.ceq.s VT0, VM0, x2 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L27 + vfcmp.clt.s VT0, VI2, VI0 + vbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + vfcmp.ceq.s VT0, VM0, x3 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L28 + vfcmp.clt.s VT0, VI3, VI0 + vbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + vfcmp.ceq.s VT0, VM0, x4 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L29 + vfcmp.clt.s VT0, VI4, VI0 + vbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: + movfr2gr.s i0, $f20 + .align 3 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.s $f9, X, 0 + addi.d I, I, -1 + vfmaxa.s VM1, x1, VM0 + vfcmp.ceq.s VT0, VM0, VM1 + add.d X, X, INCX + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + movfr2gr.s i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE From 49829b2b7db5d4ad78447c47391e8c0c96ffe803 Mon Sep 17 00:00:00 2001 From: yancheng Date: Thu, 7 Dec 2023 12:11:30 +0800 Subject: [PATCH 09/15] loongarch64: Add optimizations for iamin. 
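
The ISAMIN/IDAMIN kernels are the magnitude-minimum counterpart, using vfmina/xvfmina and the same tie-breaking toward the smaller index. For completeness, a scalar C sketch of the intended semantics (double-precision case, placeholder helper name, not part of this patch):

    #include <math.h>

    /* 1-based index of the element with the smallest |x[i]|; first occurrence wins. */
    static long ref_iamin(long n, const double *x, long incx)
    {
        if (n <= 0 || incx <= 0) return 0;
        long best = 1;
        double min = fabs(x[0]);
        for (long i = 1; i < n; i++) {
            double v = fabs(x[i * incx]);
            if (v < min) {                   /* strict '<' keeps the earliest index */
                min = v;
                best = i + 1;
            }
        }
        return best;
    }
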
--- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 3 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 3 + kernel/loongarch64/idamin_lasx.S | 275 +++++++++++++++++ kernel/loongarch64/idamin_lsx.S | 228 ++++++++++++++ kernel/loongarch64/isamin_lasx.S | 378 +++++++++++++++++++++++ kernel/loongarch64/isamin_lsx.S | 275 +++++++++++++++++ 6 files changed, 1162 insertions(+) create mode 100644 kernel/loongarch64/idamin_lasx.S create mode 100644 kernel/loongarch64/idamin_lsx.S create mode 100644 kernel/loongarch64/isamin_lasx.S create mode 100644 kernel/loongarch64/isamin_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index a4c8927ba..bff52ce93 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -28,4 +28,7 @@ IDMINKERNEL = idmin_lsx.S ISAMAXKERNEL = isamax_lsx.S IDAMAXKERNEL = idamax_lsx.S +ISAMINKERNEL = isamin_lsx.S +IDAMINKERNEL = idamin_lsx.S + endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 835b027b8..a08598cc5 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -28,6 +28,9 @@ IDMINKERNEL = idmin_lasx.S ISAMAXKERNEL = isamax_lasx.S IDAMAXKERNEL = idamax_lasx.S +ISAMINKERNEL = isamin_lasx.S +IDAMINKERNEL = idamin_lasx.S + DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S DGEMMITCOPY = dgemm_tcopy_16.S diff --git a/kernel/loongarch64/idamin_lasx.S b/kernel/loongarch64/idamin_lasx.S new file mode 100644 index 000000000..6ef1e8903 --- /dev/null +++ b/kernel/loongarch64/idamin_lasx.S @@ -0,0 +1,275 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VX0 $xr13 +#define VX1 $xr14 +#define VM0 $xr15 +#define VM1 $xr16 +#define VINC4 $xr17 +#define VINC8 $xr18 +#define VI0 $xr20 +#define VI1 $xr21 +#define VI2 $xr22 +#define VI3 $xr8 +#define VI4 $xr19 +#define VT0 $xr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + xvld VM0, X, 0 + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvadd.d VI1, VI1, VINC8 + xvld VX1, X, 4 * SIZE + xvadd.d VI2, VI1, VINC4 + xvfmina.d VM1, VX0, VX1 + xvfcmp.ceq.d VT0, VX0, VM1 + addi.d I, I, -1 + xvbitsel.v VI2, VI2, VI1, VT0 + xvfmina.d VM1, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI2, VI0, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmina.d VM1, x1, x2 
+ xvfcmp.ceq.d VT0, x1, VM1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmina.d VM0, x4, x3 + xvfcmp.ceq.d VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmina.d VM0, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.d $f17, TEMP + ffint.d.l $f17, $f17 + xvfcmp.ceq.d VT0, VM0, x1 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + xvfcmp.clt.d VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + +.L20: // INCX!=1 + move TEMP, X + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t2, 1 + xvinsgr2vr.d VM0, t3, 2 + xvinsgr2vr.d VM0, t4, 3 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 + .align 3 + +.L24: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + xvadd.d VI1, VI1, VINC8 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvadd.d VI2, VI1, VINC4 + xvfmina.d VM1, VX0, VX1 + xvfcmp.ceq.d VT0, VX0, VM1 + xvbitsel.v VI2, VI2, VI1, VT0 + xvfmina.d VM1, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + addi.d I, I, -1 + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI2, VI0, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmina.d VM1, x1, x2 + xvfcmp.ceq.d VT0, x1, VM1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmina.d VM0, x4, x3 + xvfcmp.ceq.d VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmina.d VM0, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.d $f17, TEMP + ffint.d.l $f17, $f17 + xvfcmp.ceq.d VT0, VM0, x1 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + xvfcmp.clt.d VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L26: + xvfcmp.ceq.d VT0, VM0, x2 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L27 + xvfcmp.clt.d VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + xvfcmp.ceq.d VT0, VM0, x3 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L28 + xvfcmp.clt.d VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + xvfcmp.ceq.d VT0, VM0, x4 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L29 + xvfcmp.clt.d VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: + movfr2gr.d i0, $f20 + .align 3 + +.L21: // N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 
+ slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.d $f9, X, 0 + addi.d I, I, -1 + xvfmina.d VM1, x1, VM0 + xvfcmp.ceq.d VT0, VM0, VM1 + add.d X, X, INCX + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + movfr2gr.d i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/idamin_lsx.S b/kernel/loongarch64/idamin_lsx.S new file mode 100644 index 000000000..9eb9d883f --- /dev/null +++ b/kernel/loongarch64/idamin_lsx.S @@ -0,0 +1,228 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VINC2 $vr17 +#define VINC4 $vr18 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + vld VM0, X, 0 + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC2, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 2 * SIZE + vadd.d VI2, VI1, VINC2 + vfmina.d x1, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x1 + vbitsel.v x2, VI2, VI1, VT0 + vld VX0, X, 4 * SIZE + vadd.d VI1, VI2, VINC2 + vld VX1, X, 6 * SIZE + vadd.d VI2, VI1, VINC2 + vfmina.d x3, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x3 + vbitsel.v x4, VI2, VI1, VT0 + vfmina.d x3, x1, x3 + vfcmp.ceq.d VT0, x1, x3 + addi.d I, I, -1 + vbitsel.v x2, x4, x2, VT0 + vfmina.d VM1, VM0, x3 + vfcmp.ceq.d VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, x2, VI0, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.d $f17, TEMP + ffint.d.l $f17, $f17 + vfcmp.ceq.d VT0, x2, x1 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L27 + .align 3 + +.L20: // INCX!=1 + move TEMP, X + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t2, 1 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC2, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 + .align 3 + +.L24: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI1, VINC4 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, 
X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, VINC2 + vfmina.d x1, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x1 + vbitsel.v x2, VI2, VI1, VT0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI2, VINC2 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, VINC2 + vfmina.d x3, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x3 + vbitsel.v x4, VI2, VI1, VT0 + vfmina.d x3, x1, x3 + vfcmp.ceq.d VT0, x1, x3 + addi.d I, I, -1 + vbitsel.v x2, x4, x2, VT0 + vfmina.d VM1, VM0, x3 + vbitsel.v VM0, VM1, VM0, VT0 + vfcmp.ceq.d VT0, VM0, VM1 + vbitsel.v VI0, x2, VI0, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.d $f17, TEMP + ffint.d.l $f17, $f17 + vfcmp.ceq.d VT0, x2, x1 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L27 + .align 3 + +.L26: + vfmina.d VM0, x1, x2 + vfcmp.ceq.d VT0, x1, VM0 + vbitsel.v VI0, VI2, VI1, VT0 + .align 3 + +.L27: + movfr2gr.d i0, $f20 + .align 3 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.d $f9, X, 0 + addi.d I, I, -1 + vfmina.d VM1, x1, VM0 + vfcmp.ceq.d VT0, VM0, VM1 + add.d X, X, INCX + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + movfr2gr.d i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/isamin_lasx.S b/kernel/loongarch64/isamin_lasx.S new file mode 100644 index 000000000..cbdf32530 --- /dev/null +++ b/kernel/loongarch64/isamin_lasx.S @@ -0,0 +1,378 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VX0 $xr13 +#define VX1 $xr14 +#define VM0 $xr15 +#define VM1 $xr16 +#define VINC4 $xr17 +#define VINC8 $xr18 +#define VI0 $xr20 +#define VI1 $xr21 +#define VI2 $xr22 +#define VI3 $xr8 +#define VI4 $xr19 +#define VT0 $xr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + xvld VM0, X, 0 + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 4 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //6 + addi.w i0, i0, 1 + 
xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + addi.d I, I, -1 + xvadd.w VI1, VI1, VINC8 + xvfmina.s VM1, VX0, VM0 + xvfcmp.ceq.s VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfmina.s VM1, x1, x2 + xvfcmp.ceq.s VT0, x1, VM1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmina.s VM0, x3, x4 + xvfcmp.ceq.s VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmina.s VM0, VM0, VM1 + xvfcmp.ceq.s VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.w $f17, TEMP + ffint.s.w $f17, $f17 + xvfcmp.ceq.s VT0, VM0, x1 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + xvfcmp.clt.s VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + +.L20: // INCX!=1 + move TEMP, X + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.w VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.w VM0, t2, 1 + xvinsgr2vr.w VM0, t3, 2 + xvinsgr2vr.w VM0, t4, 3 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.w VM0, t1, 4 + xvinsgr2vr.w VM0, t2, 5 + xvinsgr2vr.w VM0, t3, 6 + xvinsgr2vr.w VM0, t4, 7 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 4 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 + .align 3 + +.L24: + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvadd.w VI1, VI1, VINC8 + xvfmina.s VM1, VX0, VM0 + xvfcmp.ceq.s VT0, VM1, VM0 + addi.d I, I, -1 + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + 
xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfmina.s VM1, x1, x2 + xvfcmp.ceq.s VT0, x1, VM1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmina.s VM0, x3, x4 + xvfcmp.ceq.s VT0, x3, VM0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfmina.s VM0, VM0, VM1 + xvfcmp.ceq.s VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.w $f17, TEMP + ffint.s.w $f17, $f17 + xvfcmp.ceq.s VT0, VM0, x1 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + xvfcmp.clt.s VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L26: + xvfcmp.ceq.s VT0, VM0, x2 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L27 + xvfcmp.clt.s VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + xvfcmp.ceq.s VT0, VM0, x3 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L28 + xvfcmp.clt.s VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + xvfcmp.ceq.s VT0, VM0, x4 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L29 + xvfcmp.clt.s VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: + fmov.s $f16, $f20 + .align 3 + +.L252: + xvxor.v VI0, VI0, VI0 + xvor.v VI0, VI0, VX0 + fmov.s $f13, $f15 + xvxor.v VM0, VM0, VM0 + xvor.v VM0, VM0, VX1 + xvpickve.w VI1, VI0, 4 + xvpickve.w VI2, VI0, 5 + xvpickve.w VI3, VI0, 6 + xvpickve.w VI4, VI0, 7 + xvpickve.w x1, VM0, 4 + xvpickve.w x2, VM0, 5 + xvpickve.w x3, VM0, 6 + xvpickve.w x4, VM0, 7 + xvfmina.s VM1, x1, x2 + xvfcmp.ceq.s VT0, x1, VM1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmina.s VM0, x3, x4 + xvfcmp.ceq.s VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmina.s VM0, VM0, VM1 + xvfcmp.ceq.s VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.w $f17, TEMP + ffint.s.w $f17, $f17 + xvfcmp.ceq.s VT0, VM0, x1 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L262 + xvfcmp.clt.s VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L262: + xvfcmp.ceq.s VT0, VM0, x2 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L272 + xvfcmp.clt.s VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L272: + xvfcmp.ceq.s VT0, VM0, x3 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L282 + xvfcmp.clt.s VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L282: + xvfcmp.ceq.s VT0, VM0, x4 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L292 + xvfcmp.clt.s VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L292: + xvfmina.s VM0, VX0, VM0 + xvfcmp.ceq.s VT0, VM0, VX0 + xvbitsel.v VI0, VI0, VI1, VT0 + movfr2gr.s i0, $f20 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.s $f9, X, 0 + addi.d I, I, -1 + xvfmina.s VM1, x1, VM0 + xvfcmp.ceq.s VT0, VM0, VM1 + add.d X, X, INCX + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + movfr2gr.s i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/isamin_lsx.S b/kernel/loongarch64/isamin_lsx.S new file mode 100644 index 000000000..598888660 --- /dev/null +++ b/kernel/loongarch64/isamin_lsx.S @@ -0,0 +1,275 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define 
t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VINC4 $vr17 +#define VINC8 $vr18 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + vld VM0, X, 0 + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vadd.w VI1, VI1, VINC8 + vld VX1, X, 4 * SIZE + vadd.w VI2, VI1, VINC4 + vfmina.s VM1, VX0, VX1 + vfcmp.ceq.s VT0, VX0, VM1 + addi.d I, I, -1 + vbitsel.v VI2, VI2, VI1, VT0 + vfmina.s VM1, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI2, VI0, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmina.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC4, VI2, VI1, VT0 + vfmina.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC8, VI4, VI3, VT0 + vfmina.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.w $f17, TEMP + ffint.s.w $f17, $f17 + vfcmp.ceq.s VT0, VM0, x1 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + +.L20: // INCX!=1 + move TEMP, X + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t2, 1 + vinsgr2vr.w VM0, t3, 2 + vinsgr2vr.w VM0, t4, 3 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 + .align 3 + +.L24: + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vadd.w VI1, VI1, VINC8 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, 
X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vadd.w VI2, VI1, VINC4 + vfmina.s VM1, VX0, VX1 + vfcmp.ceq.s VT0, VX0, VM1 + vbitsel.v VI2, VI2, VI1, VT0 + vfmina.s VM1, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + addi.d I, I, -1 + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI2, VI0, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmina.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC4, VI2, VI1, VT0 + vfmina.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC8, VI4, VI3, VT0 + vfmina.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.w $f17, TEMP + ffint.s.w $f17, $f17 + vfcmp.ceq.s VT0, VM0, x1 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L26: + vfcmp.ceq.s VT0, VM0, x2 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L27 + vfcmp.clt.s VT0, VI2, VI0 + vbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + vfcmp.ceq.s VT0, VM0, x3 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L28 + vfcmp.clt.s VT0, VI3, VI0 + vbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + vfcmp.ceq.s VT0, VM0, x4 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L29 + vfcmp.clt.s VT0, VI4, VI0 + vbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: + movfr2gr.s i0, $f20 + .align 3 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.s $f9, X, 0 + addi.d I, I, -1 + vfmina.s VM1, x1, VM0 + vfcmp.ceq.s VT0, VM0, VM1 + add.d X, X, INCX + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + movfr2gr.s i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE \ No newline at end of file From 174c25766b6b43e36a9ace6da26fe21f3a7305a9 Mon Sep 17 00:00:00 2001 From: yancheng Date: Thu, 7 Dec 2023 12:15:46 +0800 Subject: [PATCH 10/15] loongarch64: Add optimizations for copy. 
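
The LSX and LASX bodies below only have to reproduce the scalar COPY semantics
y[i*inc_y] = x[i*inc_x] for i = 0..n-1. As a reading aid, here is a minimal C
sketch of that reference behaviour; it assumes the generic OpenBLAS copy kernel
signature (BLASLONG, FLOAT and CNAME come from common.h) and is illustrative
only, not one of the files added by this patch:

    #include "common.h"

    /* Illustrative scalar reference (assumes the generic copy kernel
     * signature): copy n elements from x (stride inc_x) to y (stride inc_y). */
    int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
    {
        BLASLONG i;

        if (n <= 0) return 0;      /* same early exit as "bge $r0, N, .L999" */
        for (i = 0; i < n; i++) {
            *y = *x;               /* y[i*inc_y] = x[i*inc_x] */
            x += inc_x;
            y += inc_y;
        }
        return 0;
    }

The assembly versions specialize this loop for the four inc_x/inc_y cases the
entry code dispatches on: contiguous xvld/xvst (vld/vst) when both strides are
1, xvstelm/vstelm element stores when only inc_x is 1, ld plus xvinsgr2vr
(vinsgr2vr) gathers when only inc_y is 1, and plain fld/fst pairs otherwise,
always handling 8 elements per vector iteration with a scalar tail of n % 8.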
--- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 3 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 3 + kernel/loongarch64/dcopy_lasx.S | 224 ++++++++++++++++++++++ kernel/loongarch64/dcopy_lsx.S | 232 +++++++++++++++++++++++ kernel/loongarch64/scopy_lasx.S | 216 +++++++++++++++++++++ kernel/loongarch64/scopy_lsx.S | 220 +++++++++++++++++++++ 6 files changed, 898 insertions(+) create mode 100644 kernel/loongarch64/dcopy_lasx.S create mode 100644 kernel/loongarch64/dcopy_lsx.S create mode 100644 kernel/loongarch64/scopy_lasx.S create mode 100644 kernel/loongarch64/scopy_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index bff52ce93..565bec0f2 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -31,4 +31,7 @@ IDAMAXKERNEL = idamax_lsx.S ISAMINKERNEL = isamin_lsx.S IDAMINKERNEL = idamin_lsx.S +SCOPYKERNEL = scopy_lsx.S +DCOPYKERNEL = dcopy_lsx.S + endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index a08598cc5..a2443720b 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -31,6 +31,9 @@ IDAMAXKERNEL = idamax_lasx.S ISAMINKERNEL = isamin_lasx.S IDAMINKERNEL = idamin_lasx.S +SCOPYKERNEL = scopy_lasx.S +DCOPYKERNEL = dcopy_lasx.S + DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S DGEMMITCOPY = dgemm_tcopy_16.S diff --git a/kernel/loongarch64/dcopy_lasx.S b/kernel/loongarch64/dcopy_lasx.S new file mode 100644 index 000000000..9d7da4a80 --- /dev/null +++ b/kernel/loongarch64/dcopy_lasx.S @@ -0,0 +1,224 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define VX0 $xr12 +#define VX1 $xr13 + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvst VX0, Y, 0 * SIZE + xvst VX1, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + fld.d $f12, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, SIZE + fst.d $f12, Y, 0 * SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.d VX0, Y, 0, 1 + add.d Y, Y, INCY + xvstelm.d VX0, Y, 0, 2 + add.d Y, Y, INCY + xvstelm.d VX0, Y, 0, 3 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 1 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 2 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 3 + add.d Y, Y, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + fld.d $f12, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, SIZE + fst.d $f12, Y, 0 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21: + bge $r0, I, .L212 + .align 3 + 
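+ // INCX!=1, INCY==1: gather 8 strided doubles from X (ld.d + xvinsgr2vr.d) and store them contiguously to Y (xvst)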
+.L211: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + xvst VX0, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvst VX1, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + fld.d $f12, X, 0 * SIZE + addi.d I, I, -1 + fst.d $f12, Y, 0 * SIZE + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bgez INCX, .L220 + .align 3 + +.L220: + bge $r0, I, .L223 + .align 3 + +.L222: + fld.d a1, X, 0 * SIZE + add.d X, X, INCX + fld.d a2, X, 0 * SIZE + add.d X, X, INCX + fld.d a3, X, 0 * SIZE + add.d X, X, INCX + fld.d a4, X, 0 * SIZE + add.d X, X, INCX + fst.d a1, Y, 0 * SIZE + add.d Y, Y, INCY + fst.d a2, Y, 0 * SIZE + add.d Y, Y, INCY + fst.d a3, X, 0 * SIZE + add.d Y, Y, INCY + fst.d a4, X, 0 * SIZE + add.d Y, Y, INCY + fld.d a1, X, 0 * SIZE + add.d X, X, INCX + fld.d a2, X, 0 * SIZE + add.d X, X, INCX + fld.d a3, X, 0 * SIZE + add.d X, X, INCX + fld.d a4, X, 0 * SIZE + add.d X, X, INCX + fst.d a1, Y, 0 * SIZE + add.d Y, Y, INCY + fst.d a2, Y, 0 * SIZE + add.d Y, Y, INCY + fst.d a3, X, 0 * SIZE + add.d Y, Y, INCY + fst.d a4, X, 0 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + fld.d $f12, X, 0 * SIZE + addi.d I, I, -1 + fst.d $f12, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/dcopy_lsx.S b/kernel/loongarch64/dcopy_lsx.S new file mode 100644 index 000000000..161655bbd --- /dev/null +++ b/kernel/loongarch64/dcopy_lsx.S @@ -0,0 +1,232 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define VX0 $vr12 +#define VX1 $vr13 + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vst VX0, Y, 0 * SIZE + vst VX1, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + addi.d I, I, -1 + vst VX0, Y, 4 * SIZE + vst VX1, Y, 6 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + fld.d $f12, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, SIZE + fst.d $f12, Y, 0 * SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + 
vstelm.d VX0, Y, 0, 1 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 1 + add.d Y, Y, INCY + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VX0, Y, 0, 1 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 1 + add.d Y, Y, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + fld.d $f12, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, SIZE + fst.d $f12, Y, 0 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vst VX0, Y, 0 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX1, Y, 2 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vst VX0, Y, 4 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX1, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + fld.d $f12, X, 0 * SIZE + addi.d I, I, -1 + fst.d $f12, Y, 0 * SIZE + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bgez INCX, .L220 + .align 3 + +.L220: + bge $r0, I, .L223 + .align 3 + +.L222: + fld.d a1, X, 0 * SIZE + add.d X, X, INCX + fld.d a2, X, 0 * SIZE + add.d X, X, INCX + fld.d a3, X, 0 * SIZE + add.d X, X, INCX + fld.d a4, X, 0 * SIZE + add.d X, X, INCX + fst.d a1, Y, 0 * SIZE + add.d Y, Y, INCY + fst.d a2, Y, 0 * SIZE + add.d Y, Y, INCY + fst.d a3, X, 0 * SIZE + add.d Y, Y, INCY + fst.d a4, X, 0 * SIZE + add.d Y, Y, INCY + fld.d a1, X, 0 * SIZE + add.d X, X, INCX + fld.d a2, X, 0 * SIZE + add.d X, X, INCX + fld.d a3, X, 0 * SIZE + add.d X, X, INCX + fld.d a4, X, 0 * SIZE + add.d X, X, INCX + fst.d a1, Y, 0 * SIZE + add.d Y, Y, INCY + fst.d a2, Y, 0 * SIZE + add.d Y, Y, INCY + fst.d a3, X, 0 * SIZE + add.d Y, Y, INCY + fst.d a4, X, 0 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + fld.d $f12, X, 0 * SIZE + addi.d I, I, -1 + fst.d $f12, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/scopy_lasx.S b/kernel/loongarch64/scopy_lasx.S new file mode 100644 index 000000000..7db1e7cee --- /dev/null +++ b/kernel/loongarch64/scopy_lasx.S @@ -0,0 +1,216 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define VX0 $xr12 +#define VX1 $xr13 + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b 
.L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + xvld VX0, X, 0 * SIZE + addi.d I, I, -1 + xvst VX0, Y, 0 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + fld.s $f12, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, SIZE + fst.s $f12, Y, 0 * SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: + xvld VX0, X, 0 * SIZE + xvstelm.w VX0, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 1 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 2 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 3 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 4 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 5 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 6 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 7 + add.d Y, Y, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + fld.s $f12, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, SIZE + fst.s $f12, Y, 0 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvst VX0, Y, 0 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + fld.s $f12, X, 0 * SIZE + addi.d I, I, -1 + fst.s $f12, Y, 0 * SIZE + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + +.L222: + fld.s a1, X, 0 * SIZE + add.d X, X, INCX + fld.s a2, X, 0 * SIZE + add.d X, X, INCX + fld.s a3, X, 0 * SIZE + add.d X, X, INCX + fld.s a4, X, 0 * SIZE + add.d X, X, INCX + fst.s a1, Y, 0 * SIZE + add.d Y, Y, INCY + fst.s a2, Y, 0 * SIZE + add.d Y, Y, INCY + fst.s a3, X, 0 * SIZE + add.d Y, Y, INCY + fst.s a4, X, 0 * SIZE + add.d Y, Y, INCY + fld.s a1, X, 0 * SIZE + add.d X, X, INCX + fld.s a2, X, 0 * SIZE + add.d X, X, INCX + fld.s a3, X, 0 * SIZE + add.d X, X, INCX + fld.s a4, X, 0 * SIZE + add.d X, X, INCX + fst.s a1, Y, 0 * SIZE + add.d Y, Y, INCY + fst.s a2, Y, 0 * SIZE + add.d Y, Y, INCY + fst.s a3, X, 0 * SIZE + add.d Y, Y, INCY + fst.s a4, X, 0 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + fld.s $f12, X, 0 * SIZE + addi.d I, I, -1 + fst.s $f12, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/scopy_lsx.S b/kernel/loongarch64/scopy_lsx.S new file mode 100644 index 000000000..32150d3d6 --- /dev/null +++ b/kernel/loongarch64/scopy_lsx.S @@ -0,0 +1,220 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define 
a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define VX0 $vr12 +#define VX1 $vr13 + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + addi.d I, I, -1 + vst VX0, Y, 0 * SIZE + vst VX1, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + fld.s $f12, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, SIZE + fst.s $f12, Y, 0 * SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vstelm.w VX0, Y, 0, 0 + add.d Y, Y, INCY + vstelm.w VX0, Y, 0, 1 + add.d Y, Y, INCY + vstelm.w VX0, Y, 0, 2 + add.d Y, Y, INCY + vstelm.w VX0, Y, 0, 3 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 0 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 1 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 2 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 3 + add.d Y, Y, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + fld.s $f12, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, SIZE + fst.s $f12, Y, 0 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vst VX0, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vst VX1, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + fld.s $f12, X, 0 * SIZE + addi.d I, I, -1 + fst.s $f12, Y, 0 * SIZE + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + +.L222: + fld.s a1, X, 0 * SIZE + add.d X, X, INCX + fld.s a2, X, 0 * SIZE + add.d X, X, INCX + fld.s a3, X, 0 * SIZE + add.d X, X, INCX + fld.s a4, X, 0 * SIZE + add.d X, X, INCX + fst.s a1, Y, 0 * SIZE + add.d Y, Y, INCY + fst.s a2, Y, 0 * SIZE + add.d Y, Y, INCY + fst.s a3, X, 0 * SIZE + add.d Y, Y, INCY + fst.s a4, X, 0 * SIZE + add.d Y, Y, INCY + fld.s a1, X, 0 * SIZE + add.d X, X, INCX + fld.s a2, X, 0 * SIZE + add.d X, X, INCX + fld.s a3, X, 0 * SIZE + add.d X, X, INCX + fld.s a4, X, 0 * SIZE + add.d X, X, INCX + fst.s a1, Y, 0 * SIZE + add.d Y, Y, INCY + fst.s a2, Y, 0 * SIZE + add.d Y, Y, INCY + fst.s a3, X, 0 * SIZE + add.d Y, Y, INCY + fst.s a4, X, 0 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + fld.s $f12, X, 0 * SIZE + addi.d I, I, -1 + fst.s $f12, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + 
move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE From 360acc0a41dd2ec3fc461bac390c74c958525ae8 Mon Sep 17 00:00:00 2001 From: yancheng Date: Thu, 7 Dec 2023 12:57:05 +0800 Subject: [PATCH 11/15] loongarch64: Add optimizations for swap. --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 3 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 3 + kernel/loongarch64/dswap_lasx.S | 301 +++++++++++++++++++++ kernel/loongarch64/dswap_lsx.S | 317 +++++++++++++++++++++++ kernel/loongarch64/sswap_lasx.S | 286 ++++++++++++++++++++ kernel/loongarch64/sswap_lsx.S | 294 +++++++++++++++++++++ 6 files changed, 1204 insertions(+) create mode 100644 kernel/loongarch64/dswap_lasx.S create mode 100644 kernel/loongarch64/dswap_lsx.S create mode 100644 kernel/loongarch64/sswap_lasx.S create mode 100644 kernel/loongarch64/sswap_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 565bec0f2..879a6f68b 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -34,4 +34,7 @@ IDAMINKERNEL = idamin_lsx.S SCOPYKERNEL = scopy_lsx.S DCOPYKERNEL = dcopy_lsx.S +SSWAPKERNEL = sswap_lsx.S +DSWAPKERNEL = dswap_lsx.S + endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index a2443720b..581cfdbbe 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -34,6 +34,9 @@ IDAMINKERNEL = idamin_lasx.S SCOPYKERNEL = scopy_lasx.S DCOPYKERNEL = dcopy_lasx.S +SSWAPKERNEL = sswap_lasx.S +DSWAPKERNEL = dswap_lasx.S + DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S DGEMMITCOPY = dgemm_tcopy_16.S diff --git a/kernel/loongarch64/dswap_lasx.S b/kernel/loongarch64/dswap_lasx.S new file mode 100644 index 000000000..221cb7fa2 --- /dev/null +++ b/kernel/loongarch64/dswap_lasx.S @@ -0,0 +1,301 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define YY $r6 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 + + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + addi.d I, I, -1 + xvst VX2, X, 0 * SIZE + xvst VX3, X, 4 * SIZE + xvst VX0, Y, 0 * SIZE + xvst VX1, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + fld.d $f12, X, 0 * SIZE + fld.d $f14, Y, 0 * SIZE + addi.d I, I, -1 + fst.d $f12, Y, 0 * SIZE + fst.d $f14, X, 0 * SIZE + addi.d X, X, SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + .align 3 + +.L121: + xvld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + xvstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + xvstelm.d VX0, Y, 0, 1 + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + 
xvstelm.d VX0, Y, 0, 2 + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvstelm.d VX0, Y, 0, 3 + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvst VX2, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + xvstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + xvstelm.d VX1, Y, 0, 1 + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + xvstelm.d VX1, Y, 0, 2 + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvstelm.d VX1, Y, 0, 3 + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvst VX3, X, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + fld.d $f12, X, 0 * SIZE + fld.d $f14, Y, 0 * SIZE + addi.d I, I, -1 + fst.d $f12, Y, 0 * SIZE + fst.d $f14, X, 0 * SIZE + addi.d X, X, SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: + xvld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + xvstelm.d VX2, X, 0, 0 + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + xvstelm.d VX2, X, 0, 1 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + xvstelm.d VX2, X, 0, 2 + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvstelm.d VX2, X, 0, 3 + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvst VX0, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + xvstelm.d VX3, X, 0, 0 + add.d X, X, INCY + ld.d t2, X, 0 * SIZE + xvstelm.d VX3, X, 0, 1 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + xvstelm.d VX3, X, 0, 2 + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvstelm.d VX3, X, 0, 3 + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvst VX1, Y, 0 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + fld.d $f12, X, 0 * SIZE + fld.d $f14, Y, 0 * SIZE + addi.d I, I, -1 + fst.d $f12, Y, 0 * SIZE + fst.d $f14, X, 0 * SIZE + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bgez INCX, .L220 + //addi.d TEMP, N, -1 + //mul.d TEMP, TEMP, INCX + //sub.d X, X, TEMP + .align 3 + +.L220: + bge $r0, I, .L223 + .align 3 + move XX, X + +.L222: + fld.d a1, X, 0 * SIZE + add.d X, X, INCX + fld.d a2, X, 0 * SIZE + add.d X, X, INCX + fld.d a3, X, 0 * SIZE + add.d X, X, INCX + fld.d a4, X, 0 * SIZE + add.d X, X, INCX + fld.d b1, Y, 0 * SIZE + fst.d a1, Y, 0 * SIZE + add.d Y, Y, INCY + fld.d b2, Y, 0 * SIZE + fst.d a2, Y, 0 * SIZE + add.d Y, Y, INCY + fld.d b3, Y, 0 * SIZE + fst.d a3, Y, 0 * SIZE + add.d Y, Y, INCY + fld.d b4, Y, 0 * SIZE + fst.d a4, Y, 0 * SIZE + add.d Y, Y, INCY + fld.d a1, X, 0 * SIZE + add.d X, X, INCX + fst.d b1, XX, 0 * SIZE + add.d XX, XX, INCX + fld.d b1, Y, 0 * SIZE + fst.d a1, Y, 0 * SIZE + add.d Y, Y, INCY + fld.d a2, X, 0 * SIZE + add.d X, X, INCX + fst.d b2, XX, 0 * SIZE + add.d XX, XX, INCX + fld.d b2, Y, 0 * SIZE + fst.d a2, Y, 0 * SIZE + add.d Y, Y, INCY + fld.d a3, X, 0 * SIZE + add.d X, X, INCX + fst.d b3, XX, 0 * SIZE + add.d XX, XX, INCX + fld.d b3, Y, 0 * SIZE + fst.d a3, Y, 0 * SIZE + fld.d a4, X, 0 * SIZE + add.d X, X, INCX + fst.d b4, XX, 0 * SIZE + add.d XX, XX, INCX + fld.d b4, Y, 0 * SIZE + fst.d a4, Y, 0 * SIZE + add.d Y, Y, INCY + fst.d b1, XX, 0 * SIZE + add.d XX, XX, INCX + fst.d b2, XX, 0 * SIZE + add.d XX, XX, INCX + fst.d b3, 
XX, 0 * SIZE + add.d XX, XX, INCX + fst.d b4, XX, 0 * SIZE + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + fld.d $f12, X, 0 * SIZE + fld.d $f14, Y, 0 * SIZE + addi.d I, I, -1 + fst.d $f12, Y, 0 * SIZE + fst.d $f14, X, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/dswap_lsx.S b/kernel/loongarch64/dswap_lsx.S new file mode 100644 index 000000000..7f7f585e1 --- /dev/null +++ b/kernel/loongarch64/dswap_lsx.S @@ -0,0 +1,317 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define YY $r6 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 + + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + vst VX2, X, 0 * SIZE + vst VX3, X, 2 * SIZE + vst VX0, Y, 0 * SIZE + vst VX1, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX2, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + addi.d I, I, -1 + vst VX2, X, 4 * SIZE + vst VX3, X, 6 * SIZE + vst VX0, Y, 4 * SIZE + vst VX1, Y, 6 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + fld.d $f12, X, 0 * SIZE + fld.d $f14, Y, 0 * SIZE + addi.d I, I, -1 + fst.d $f12, Y, 0 * SIZE + fst.d $f14, X, 0 * SIZE + addi.d X, X, SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + .align 3 + +.L121: + vld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + vstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vstelm.d VX0, Y, 0, 1 + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vst VX2, X, 0 * SIZE + vld VX1, X, 2 * SIZE + ld.d t3, Y, 0 * SIZE + vstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vstelm.d VX1, Y, 0, 1 + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vst VX3, X, 2 * SIZE + vld VX0, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + vstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vstelm.d VX0, Y, 0, 1 + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vst VX2, X, 4 * SIZE + vld VX1, X, 6 * SIZE + ld.d t3, Y, 0 * SIZE + vstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vstelm.d VX1, Y, 0, 1 + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vst VX3, X, 6 * SIZE + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + fld.d $f12, X, 0 * SIZE + fld.d $f14, Y, 0 * SIZE + addi.d I, I, -1 + fst.d $f12, Y, 0 * SIZE + fst.d $f14, X, 0 * SIZE + addi.d X, X, SIZE + add.d Y, Y, INCY + blt $r0, 
I, .L123 + b .L999 + .align 3 + +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: + vld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + vstelm.d VX2, X, 0, 0 + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vstelm.d VX2, X, 0, 1 + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCY + vst VX0, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + ld.d t3, X, 0 * SIZE + vstelm.d VX3, X, 0, 0 + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vstelm.d VX3, X, 0, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vst VX1, Y, 2 * SIZE + vld VX2, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + vstelm.d VX2, X, 0, 0 + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vstelm.d VX2, X, 0, 1 + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCY + vst VX0, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t3, X, 0 * SIZE + vstelm.d VX3, X, 0, 0 + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vstelm.d VX3, X, 0, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vst VX1, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + fld.d $f12, X, 0 * SIZE + fld.d $f14, Y, 0 * SIZE + addi.d I, I, -1 + fst.d $f12, Y, 0 * SIZE + fst.d $f14, X, 0 * SIZE + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bgez INCX, .L220 + //addi.d TEMP, N, -1 + //mul.d TEMP, TEMP, INCX + //sub.d X, X, TEMP + .align 3 + +.L220: + bge $r0, I, .L223 + .align 3 + move XX, X + +.L222: + fld.d a1, X, 0 * SIZE + add.d X, X, INCX + fld.d a2, X, 0 * SIZE + add.d X, X, INCX + fld.d a3, X, 0 * SIZE + add.d X, X, INCX + fld.d a4, X, 0 * SIZE + add.d X, X, INCX + fld.d b1, Y, 0 * SIZE + fst.d a1, Y, 0 * SIZE + add.d Y, Y, INCY + fld.d b2, Y, 0 * SIZE + fst.d a2, Y, 0 * SIZE + add.d Y, Y, INCY + fld.d b3, Y, 0 * SIZE + fst.d a3, Y, 0 * SIZE + add.d Y, Y, INCY + fld.d b4, Y, 0 * SIZE + fst.d a4, Y, 0 * SIZE + add.d Y, Y, INCY + fld.d a1, X, 0 * SIZE + add.d X, X, INCX + fst.d b1, XX, 0 * SIZE + add.d XX, XX, INCX + fld.d b1, Y, 0 * SIZE + fst.d a1, Y, 0 * SIZE + add.d Y, Y, INCY + fld.d a2, X, 0 * SIZE + add.d X, X, INCX + fst.d b2, XX, 0 * SIZE + add.d XX, XX, INCX + fld.d b2, Y, 0 * SIZE + fst.d a2, Y, 0 * SIZE + add.d Y, Y, INCY + fld.d a3, X, 0 * SIZE + add.d X, X, INCX + fst.d b3, XX, 0 * SIZE + add.d XX, XX, INCX + fld.d b3, Y, 0 * SIZE + fst.d a3, Y, 0 * SIZE + fld.d a4, X, 0 * SIZE + add.d X, X, INCX + fst.d b4, XX, 0 * SIZE + add.d XX, XX, INCX + fld.d b4, Y, 0 * SIZE + fst.d a4, Y, 0 * SIZE + add.d Y, Y, INCY + fst.d b1, XX, 0 * SIZE + add.d XX, XX, INCX + fst.d b2, XX, 0 * SIZE + add.d XX, XX, INCX + fst.d b3, XX, 0 * SIZE + add.d XX, XX, INCX + fst.d b4, XX, 0 * SIZE + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + fld.d $f12, X, 0 * SIZE + fld.d $f14, Y, 0 * SIZE + addi.d I, I, -1 + fst.d $f12, Y, 0 * SIZE + fst.d $f14, X, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/sswap_lasx.S b/kernel/loongarch64/sswap_lasx.S new file mode 100644 index 000000000..7184eff45 --- /dev/null +++ b/kernel/loongarch64/sswap_lasx.S @@ -0,0 +1,286 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define YY $r6 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 
+#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 + + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + addi.d I, I, -1 + xvst VX2, X, 0 * SIZE + xvst VX0, Y, 0 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + fld.s $f12, X, 0 * SIZE + fld.s $f14, Y, 0 * SIZE + addi.d I, I, -1 + fst.s $f12, Y, 0 * SIZE + fst.s $f14, X, 0 * SIZE + addi.d X, X, SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + .align 3 + +.L121: + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + xvstelm.w VX0, Y, 0, 0 + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + xvstelm.w VX0, Y, 0, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + xvstelm.w VX0, Y, 0, 2 + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvstelm.w VX0, Y, 0, 3 + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + ld.w t1, Y, 0 * SIZE + xvstelm.w VX0, Y, 0, 4 + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + xvstelm.w VX0, Y, 0, 5 + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + xvstelm.w VX0, Y, 0, 6 + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvstelm.w VX0, Y, 0, 7 + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvst VX2, X, 0 * SIZE + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + fld.s $f12, X, 0 * SIZE + fld.s $f14, Y, 0 * SIZE + addi.d I, I, -1 + fst.s $f12, Y, 0 * SIZE + fst.s $f14, X, 0 * SIZE + addi.d X, X, SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + xvstelm.w VX2, X, 0, 0 + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + xvstelm.w VX2, X, 0, 1 + add.d X, X, INCY + ld.w t3, X, 0 * SIZE + xvstelm.w VX2, X, 0, 2 + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvstelm.w VX2, X, 0, 3 + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + xvstelm.w VX2, X, 0, 4 + add.d X, X, INCY + ld.w t2, X, 0 * SIZE + xvstelm.w VX2, X, 0, 5 + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + xvstelm.w VX2, X, 0, 6 + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvstelm.w VX2, X, 0, 7 + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvst VX1, Y, 0 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + fld.s $f12, X, 0 * SIZE + fld.s $f14, Y, 0 * SIZE + addi.d I, I, -1 + fst.s $f12, Y, 0 * SIZE + fst.s $f14, X, 0 * SIZE + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + move XX, X + +.L222: + fld.s a1, 
X, 0 * SIZE + add.d X, X, INCX + fld.s a2, X, 0 * SIZE + add.d X, X, INCX + fld.s a3, X, 0 * SIZE + add.d X, X, INCX + fld.s a4, X, 0 * SIZE + add.d X, X, INCX + fld.s b1, Y, 0 * SIZE + fst.s a1, Y, 0 * SIZE + add.d Y, Y, INCY + fld.s b2, Y, 0 * SIZE + fst.s a2, Y, 0 * SIZE + add.d Y, Y, INCY + fld.s b3, Y, 0 * SIZE + fst.s a3, Y, 0 * SIZE + add.d Y, Y, INCY + fld.s b4, Y, 0 * SIZE + fst.s a4, Y, 0 * SIZE + add.d Y, Y, INCY + fld.s a1, X, 0 * SIZE + add.d X, X, INCX + fst.s b1, XX, 0 * SIZE + add.d XX, XX, INCX + fld.s b1, Y, 0 * SIZE + fst.s a1, Y, 0 * SIZE + add.d Y, Y, INCY + fld.s a2, X, 0 * SIZE + add.d X, X, INCX + fst.s b2, XX, 0 * SIZE + add.d XX, XX, INCX + fld.s b2, Y, 0 * SIZE + fst.s a2, Y, 0 * SIZE + add.d Y, Y, INCY + fld.s a3, X, 0 * SIZE + add.d X, X, INCX + fst.s b3, XX, 0 * SIZE + add.d XX, XX, INCX + fld.s b3, Y, 0 * SIZE + fst.s a3, Y, 0 * SIZE + fld.s a4, X, 0 * SIZE + add.d X, X, INCX + fst.s b4, XX, 0 * SIZE + add.d XX, XX, INCX + fld.s b4, Y, 0 * SIZE + fst.s a4, Y, 0 * SIZE + add.d Y, Y, INCY + fst.s b1, XX, 0 * SIZE + add.d XX, XX, INCX + fst.s b2, XX, 0 * SIZE + add.d XX, XX, INCX + fst.s b3, XX, 0 * SIZE + add.d XX, XX, INCX + fst.s b4, XX, 0 * SIZE + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + fld.s $f12, X, 0 * SIZE + fld.s $f14, Y, 0 * SIZE + addi.d I, I, -1 + fst.s $f12, Y, 0 * SIZE + fst.s $f14, X, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/sswap_lsx.S b/kernel/loongarch64/sswap_lsx.S new file mode 100644 index 000000000..4f19a8024 --- /dev/null +++ b/kernel/loongarch64/sswap_lsx.S @@ -0,0 +1,294 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define YY $r6 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 + + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + addi.d I, I, -1 + vst VX2, X, 0 * SIZE + vst VX3, X, 4 * SIZE + vst VX0, Y, 0 * SIZE + vst VX1, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + fld.s $f12, X, 0 * SIZE + fld.s $f14, Y, 0 * SIZE + addi.d I, I, -1 + fst.s $f12, Y, 0 * SIZE + fst.s $f14, X, 0 * SIZE + addi.d X, X, SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + .align 3 + +.L121: + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + vstelm.w VX0, Y, 0, 0 + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + vstelm.w VX0, Y, 0, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + vstelm.w VX0, Y, 0, 2 + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vstelm.w VX0, Y, 0, 3 + 
vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vst VX2, X, 0 * SIZE + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + vstelm.w VX1, Y, 0, 0 + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + vstelm.w VX1, Y, 0, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + vstelm.w VX1, Y, 0, 2 + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vstelm.w VX1, Y, 0, 3 + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vst VX3, X, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + fld.s $f12, X, 0 * SIZE + fld.s $f14, Y, 0 * SIZE + addi.d I, I, -1 + fst.s $f12, Y, 0 * SIZE + fst.s $f14, X, 0 * SIZE + addi.d X, X, SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L212 + .align 3 + +.L211: + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + vstelm.w VX2, X, 0, 0 + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + vstelm.w VX2, X, 0, 1 + add.d X, X, INCY + ld.w t3, X, 0 * SIZE + vstelm.w VX2, X, 0, 2 + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vstelm.w VX2, X, 0, 3 + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vst VX0, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + vstelm.w VX3, X, 0, 0 + add.d X, X, INCY + ld.w t2, X, 0 * SIZE + vstelm.w VX3, X, 0, 1 + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + vstelm.w VX3, X, 0, 2 + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vstelm.w VX3, X, 0, 3 + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + vst VX1, Y, 0 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + fld.s $f12, X, 0 * SIZE + fld.s $f14, Y, 0 * SIZE + addi.d I, I, -1 + fst.s $f12, Y, 0 * SIZE + fst.s $f14, X, 0 * SIZE + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + move XX, X + +.L222: + fld.s a1, X, 0 * SIZE + add.d X, X, INCX + fld.s a2, X, 0 * SIZE + add.d X, X, INCX + fld.s a3, X, 0 * SIZE + add.d X, X, INCX + fld.s a4, X, 0 * SIZE + add.d X, X, INCX + fld.s b1, Y, 0 * SIZE + fst.s a1, Y, 0 * SIZE + add.d Y, Y, INCY + fld.s b2, Y, 0 * SIZE + fst.s a2, Y, 0 * SIZE + add.d Y, Y, INCY + fld.s b3, Y, 0 * SIZE + fst.s a3, Y, 0 * SIZE + add.d Y, Y, INCY + fld.s b4, Y, 0 * SIZE + fst.s a4, Y, 0 * SIZE + add.d Y, Y, INCY + fld.s a1, X, 0 * SIZE + add.d X, X, INCX + fst.s b1, XX, 0 * SIZE + add.d XX, XX, INCX + fld.s b1, Y, 0 * SIZE + fst.s a1, Y, 0 * SIZE + add.d Y, Y, INCY + fld.s a2, X, 0 * SIZE + add.d X, X, INCX + fst.s b2, XX, 0 * SIZE + add.d XX, XX, INCX + fld.s b2, Y, 0 * SIZE + fst.s a2, Y, 0 * SIZE + add.d Y, Y, INCY + fld.s a3, X, 0 * SIZE + add.d X, X, INCX + fst.s b3, XX, 0 * SIZE + add.d XX, XX, INCX + fld.s b3, Y, 0 * SIZE + fst.s a3, Y, 0 * SIZE + fld.s a4, X, 0 * SIZE + add.d X, X, INCX + fst.s b4, XX, 0 * SIZE + add.d XX, XX, INCX + fld.s b4, Y, 0 * SIZE + fst.s a4, Y, 0 * SIZE + add.d Y, Y, INCY + fst.s b1, XX, 0 * SIZE + add.d XX, XX, INCX + fst.s b2, XX, 0 * SIZE + add.d XX, XX, INCX + fst.s b3, XX, 0 * SIZE + add.d XX, XX, INCX + fst.s b4, XX, 0 * SIZE + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + fld.s $f12, X, 0 * 
SIZE + fld.s $f14, Y, 0 * SIZE + addi.d I, I, -1 + fst.s $f12, Y, 0 * SIZE + fst.s $f14, X, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE From d4c96a35a80da33c9d71fc5dd291f05f55900e27 Mon Sep 17 00:00:00 2001 From: yancheng Date: Thu, 7 Dec 2023 13:02:03 +0800 Subject: [PATCH 12/15] loongarch64: Add optimizations for axpy and axpby. --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 6 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 6 + kernel/loongarch64/daxpby_lasx.S | 629 ++++++++++++++++++++ kernel/loongarch64/daxpby_lsx.S | 693 +++++++++++++++++++++++ kernel/loongarch64/daxpy_lasx.S | 338 +++++++++++ kernel/loongarch64/daxpy_lsx.S | 365 ++++++++++++ kernel/loongarch64/saxpby_lasx.S | 597 +++++++++++++++++++ kernel/loongarch64/saxpby_lsx.S | 629 ++++++++++++++++++++ kernel/loongarch64/saxpy_lasx.S | 323 +++++++++++ kernel/loongarch64/saxpy_lsx.S | 338 +++++++++++ 10 files changed, 3924 insertions(+) create mode 100644 kernel/loongarch64/daxpby_lasx.S create mode 100644 kernel/loongarch64/daxpby_lsx.S create mode 100644 kernel/loongarch64/daxpy_lasx.S create mode 100644 kernel/loongarch64/daxpy_lsx.S create mode 100644 kernel/loongarch64/saxpby_lasx.S create mode 100644 kernel/loongarch64/saxpby_lsx.S create mode 100644 kernel/loongarch64/saxpy_lasx.S create mode 100644 kernel/loongarch64/saxpy_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 879a6f68b..a94303151 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -37,4 +37,10 @@ DCOPYKERNEL = dcopy_lsx.S SSWAPKERNEL = sswap_lsx.S DSWAPKERNEL = dswap_lsx.S +SAXPYKERNEL = saxpy_lsx.S +DAXPYKERNEL = daxpy_lsx.S + +SAXPBYKERNEL = saxpby_lsx.S +DAXPBYKERNEL = daxpby_lsx.S + endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 581cfdbbe..4cfd53058 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -37,6 +37,12 @@ DCOPYKERNEL = dcopy_lasx.S SSWAPKERNEL = sswap_lasx.S DSWAPKERNEL = dswap_lasx.S +SAXPYKERNEL = saxpy_lasx.S +DAXPYKERNEL = daxpy_lasx.S + +SAXPBYKERNEL = saxpby_lasx.S +DAXPBYKERNEL = daxpby_lasx.S + DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S DGEMMITCOPY = dgemm_tcopy_16.S diff --git a/kernel/loongarch64/daxpby_lasx.S b/kernel/loongarch64/daxpby_lasx.S new file mode 100644 index 000000000..4b19703e7 --- /dev/null +++ b/kernel/loongarch64/daxpby_lasx.S @@ -0,0 +1,629 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define ALPHA $f0 +#define X $r5 +#define INCX $r6 +#define BETA $f1 +#define Y $r7 +#define INCY $r8 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define a2 $f13 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VXA $xr23 +#define VXB $xr9 +#define VXZ $xr19 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.d.l a1, a1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + movfr2gr.d t1, ALPHA + xvreplgr2vr.d VXA, t1 + movfr2gr.d t2, BETA + xvreplgr2vr.d VXB, t2 + movfr2gr.d t3, a1 + xvreplgr2vr.d VXZ, t3 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b 
.L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + fcmp.ceq.d $fcc0, ALPHA, a1 + bcnez $fcc0, .L110 + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L112 // ALPHA!=0 BETA==0 + b .L111 // ALPHA!=0 BETA!=0 + .align 3 + +.L110: + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L114 // ALPHA==0 BETA==0 + b .L113 // ALPHA==0 BETA!=0 + .align 3 + +.L111: // ALPHA!=0 BETA!=0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvfmul.d VX0, VX0, VXA + xvfmul.d VX1, VX1, VXA + xvfmadd.d VX2, VX2, VXB, VX0 + xvfmadd.d VX3, VX3, VXB, VX1 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // ALPHA!=0 BETA==0 + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfmul.d VX0, VX0, VXA + xvfmul.d VX1, VX1, VXA + xvst VX0, Y, 0 * SIZE + xvst VX1, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // ALPHA==0 BETA!=0 + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + xvfmul.d VX2, VX2, VXB + xvfmul.d VX3, VX3, VXB + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // ALPHA==0 BETA==0 + xvst VXZ, Y, 0 * SIZE + xvst VXZ, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + fcmp.ceq.d $fcc0, ALPHA, a1 + bcnez $fcc0, .L120 + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L122 // ALPHA!=0 BETA==0 + b .L121 // ALPHA!=0 BETA!=0 + .align 3 + +.L120: + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L124 // ALPHA==0 BETA==0 + b .L123 // ALPHA==0 BETA!=0 + .align 3 + +.L121: // ALPHA!=0 BETA!=0 + xvld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX0, VX0, VXA + xvld VX1, X, 4 * SIZE + xvfmadd.d VX2, VX2, VXB, VX0 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + xvstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 3 + add.d YY, YY, INCY + xvfmul.d VX1, VX1, VXA + xvfmadd.d VX3, VX3, VXB, VX1 + addi.d I, I, -1 + xvstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + b .L997 + .align 3 + +.L122: // ALPHA!=0 BETA==0 + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfmul.d VX0, VX0, VXA + xvfmul.d VX1, VX1, VXA + xvstelm.d VX0, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX0, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX0, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX0, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L122 + b .L997 + 
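All of the labelled branches in this file (.L111 through .L224) compute the same DAXPBY update as the scalar tail loop .L998 at the end of the file, y := alpha*x + beta*y; the vector paths only unroll it by eight and specialise on unit versus non-unit strides and on alpha or beta being zero. A minimal C sketch of that scalar behaviour, assuming positive increments; the function and parameter names are illustrative, not the OpenBLAS kernel interface:

    #include <stddef.h>

    /* Reference scalar AXPBY, double precision: y := alpha*x + beta*y.
       Hypothetical helper for illustration only. */
    static void daxpby_ref(size_t n, double alpha, const double *x, ptrdiff_t incx,
                           double beta, double *y, ptrdiff_t incy)
    {
        for (size_t i = 0; i < n; i++) {
            *y = alpha * (*x) + beta * (*y);  /* matches the fmul.d + fmadd.d pair in .L998 */
            x += incx;
            y += incy;
        }
    }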
.align 3 + +.L123: // ALPHA==0 BETA!=0 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX2, VX2, VXB + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + xvstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 3 + add.d YY, YY, INCY + xvfmul.d VX3, VX3, VXB + addi.d I, I, -1 + xvstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 3 + add.d YY, YY, INCY + blt $r0, I, .L123 + b .L997 + .align 3 + +.L124: // ALPHA==0 BETA==0 + xvstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L124 + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + fcmp.ceq.d $fcc0, ALPHA, a1 + bcnez $fcc0, .L210 + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L212 // ALPHA!=0 BETA==0 + b .L211 // ALPHA!=0 BETA!=0 + .align 3 + +.L210: + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L214 // ALPHA==0 BETA==0 + b .L213 // ALPHA==0 BETA!=0 + .align 3 + +.L211: // ALPHA!=0 BETA!=0 + xvld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvfmul.d VX0, VXA, VX0 + xvfmadd.d VX2, VX2, VXB, VX0 + xvld VX3, Y, 4 * SIZE + xvst VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvfmul.d VX1, VX1, VXA + xvfmadd.d VX3, VX3, VXB, VX1 + addi.d I, I, -1 + xvst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + b .L997 + .align 3 + +.L212: // ALPHA!=0 BETA==0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvfmul.d VX0, VXA, VX0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvst VX0, Y, 0 * SIZE + xvfmul.d VX1, VX1, VXA + addi.d I, I, -1 + xvst VX1, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L212 + b .L997 + .align 3 + +.L213: // ALPHA==0 BETA!=0 + xvld VX2, Y, 0 * SIZE + xvld VX3, 
Y, 4 * SIZE + xvfmul.d VX2, VX2, VXB + xvfmul.d VX3, VX3, VXB + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L213 + b .L997 + .align 3 + +.L214: // ALPHA==0 BETA==0 + xvst VXZ, Y, 0 * SIZE + xvst VXZ, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L214 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + fcmp.ceq.d $fcc0, ALPHA, a1 + bcnez $fcc0, .L220 + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L222 // ALPHA!=0 BETA==0 + b .L221 // ALPHA!=0 BETA!=0 + .align 3 + +.L220: + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L224 // ALPHA==0 BETA==0 + b .L223 // ALPHA==0 BETA!=0 + .align 3 + +.L221: // ALPHA!=0 BETA!=0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX0, VX0, VXA + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + xvfmadd.d VX2, VX2, VXB, VX0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 0 + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX1, VX1, VXA + xvfmadd.d VX3, VX3, VXB, VX1 + addi.d I, I, -1 + xvstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 3 + add.d YY, YY, INCY + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: // ALPHA!=0 BETA==0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvfmul.d VX0, VX0, VXA + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvstelm.d VX0, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX0, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX0, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX0, YY, 0, 3 + add.d YY, YY, INCY + xvfmul.d VX1, VX1, VXA + addi.d I, I, -1 + xvstelm.d VX1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 3 + add.d YY, YY, INCY + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: // ALPHA==0 BETA!=0 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, 
INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX2, VX2, VXB + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + xvstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 3 + add.d YY, YY, INCY + xvfmul.d VX3, VX3, VXB + addi.d I, I, -1 + xvstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 3 + add.d YY, YY, INCY + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: // ALPHA==0 BETA==0 + xvstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + b .L997 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.d $f12, X, 0 * SIZE + fld.d $f13, Y, 0 * SIZE + addi.d I, I, -1 + fmul.d $f12, $f12, ALPHA + fmadd.d $f13, $f13, BETA, $f12 + fst.d $f13, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/daxpby_lsx.S b/kernel/loongarch64/daxpby_lsx.S new file mode 100644 index 000000000..9aafbaf2a --- /dev/null +++ b/kernel/loongarch64/daxpby_lsx.S @@ -0,0 +1,693 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define ALPHA $f0 +#define X $r5 +#define INCX $r6 +#define BETA $f1 +#define Y $r7 +#define INCY $r8 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define a2 $f13 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VXA $vr23 +#define VXB $vr9 +#define VXZ $vr19 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.d.l a1, a1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + movfr2gr.d t1, ALPHA + vreplgr2vr.d VXA, t1 + movfr2gr.d t2, BETA + vreplgr2vr.d VXB, t2 + movfr2gr.d t3, a1 + vreplgr2vr.d VXZ, t3 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + fcmp.ceq.d $fcc0, ALPHA, a1 + bcnez $fcc0, .L110 + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L112 // ALPHA!=0 BETA==0 + b .L111 // ALPHA!=0 BETA!=0 + .align 3 + +.L110: + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L114 // ALPHA==0 BETA==0 + b .L113 // ALPHA==0 BETA!=0 + .align 3 + +.L111: // ALPHA!=0 BETA!=0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vfmul.d VX0, VX0, VXA + vfmul.d VX1, VX1, VXA + vfmadd.d VX2, VX2, VXB, VX0 + vfmadd.d VX3, VX3, VXB, VX1 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE 
+ vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + vfmul.d VX0, VX0, VXA + vfmul.d VX1, VX1, VXA + vfmadd.d VX2, VX2, VXB, VX0 + vfmadd.d VX3, VX3, VXB, VX1 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // ALPHA!=0 BETA==0 + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfmul.d VX0, VX0, VXA + vfmul.d VX1, VX1, VXA + vst VX0, Y, 0 * SIZE + vst VX1, Y, 2 * SIZE + vld VX2, X, 4 * SIZE + vld VX3, X, 6 * SIZE + vfmul.d VX2, VX2, VXA + vfmul.d VX3, VX3, VXA + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // ALPHA==0 BETA!=0\ + vld VX0, Y, 0 * SIZE + vld VX1, Y, 2 * SIZE + vfmul.d VX0, VX0, VXB + vfmul.d VX1, VX1, VXB + vst VX0, Y, 0 * SIZE + vst VX1, Y, 2 * SIZE + vld VX2, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + vfmul.d VX2, VX2, VXB + vfmul.d VX3, VX3, VXB + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // ALPHA==0 BETA==0 + vst VXZ, Y, 0 * SIZE + vst VXZ, Y, 2 * SIZE + vst VXZ, Y, 4 * SIZE + vst VXZ, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + fcmp.ceq.d $fcc0, ALPHA, a1 + bcnez $fcc0, .L120 + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L122 // ALPHA!=0 BETA==0 + b .L121 // ALPHA!=0 BETA!=0 + .align 3 + +.L120: + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L124 // ALPHA==0 BETA==0 + b .L123 // ALPHA==0 BETA!=0 + .align 3 + +.L121: // ALPHA!=0 BETA!=0 + vld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + vfmul.d VX0, VX0, VXA + vld VX1, X, 2 * SIZE + vfmadd.d VX2, VX2, VXB, VX0 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX1, VX1, VXA + vld VX0, X, 4 * SIZE + vfmadd.d VX3, VX3, VXB, VX1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX0, VX0, VXA + vld VX1, X, 6 * SIZE + vfmadd.d VX2, VX2, VXB, VX0 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX1, VX1, VXA + vfmadd.d VX3, VX3, VXB, VX1 + addi.d I, I, -1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + b .L997 + .align 3 + +.L122: // ALPHA!=0 BETA==0 + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfmul.d VX0, VX0, VXA + vfmul.d VX1, VX1, VXA + vstelm.d VX0, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX0, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + add.d YY, YY, INCY + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vfmul.d VX0, VX0, VXA + vfmul.d VX1, VX1, VXA + vstelm.d VX0, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX0, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX1, 
YY, 0, 1 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L122 + b .L997 + .align 3 + +.L123: // ALPHA==0 BETA!=0 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmul.d VX2, VX2, VXB + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX3, VX3, VXB + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX2, VX2, VXB + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX3, VX3, VXB + addi.d I, I, -1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + blt $r0, I, .L123 + b .L997 + .align 3 + +.L124: // ALPHA==0 BETA==0 + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L124 + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + fcmp.ceq.d $fcc0, ALPHA, a1 + bcnez $fcc0, .L210 + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L212 // ALPHA!=0 BETA==0 + b .L211 // ALPHA!=0 BETA!=0 + .align 3 + +.L210: + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L214 // ALPHA==0 BETA==0 + b .L213 // ALPHA==0 BETA!=0 + .align 3 + +.L211: // ALPHA!=0 BETA!=0 + vld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vfmul.d VX0, VXA, VX0 + vld VX3, Y, 2 * SIZE + vfmadd.d VX2, VX2, VXB, VX0 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX2, Y, 0 * SIZE + vfmul.d VX1, VXA, VX1 + vld VX2, Y, 4 * SIZE + vfmadd.d VX3, VX3, VXB, VX1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vst VX3, Y, 2 * SIZE + vfmul.d VX0, VX0, VXA + vld VX3, Y, 6 * SIZE + vfmadd.d VX2, VX2, VXB, VX0 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX2, Y, 4 * SIZE + vfmul.d VX1, VX1, VXA + vfmadd.d VX3, VX3, VXB, VX1 + addi.d I, I, -1 + vst VX3, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + b .L997 + .align 3 + +.L212: // ALPHA!=0 BETA==0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vfmul.d VX0, VXA, VX0 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 0 * SIZE + vfmul.d VX1, VXA, VX1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vst 
VX1, Y, 2 * SIZE + vfmul.d VX0, VX0, VXA + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 4 * SIZE + vfmul.d VX1, VX1, VXA + addi.d I, I, -1 + vst VX1, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L212 + b .L997 + .align 3 + +.L213: // ALPHA==0 BETA!=0 + vld VX2, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + vfmul.d VX2, VX2, VXB + vfmul.d VX3, VX3, VXB + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + vld VX2, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + vfmul.d VX2, VX2, VXB + vfmul.d VX3, VX3, VXB + addi.d I, I, -1 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L213 + b .L997 + .align 3 + +.L214: // ALPHA==0 BETA==0 + vst VXZ, Y, 0 * SIZE + vst VXZ, Y, 2 * SIZE + vst VXZ, Y, 4 * SIZE + vst VXZ, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L214 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + fcmp.ceq.d $fcc0, ALPHA, a1 + bcnez $fcc0, .L220 + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L222 // ALPHA!=0 BETA==0 + b .L221 // ALPHA!=0 BETA!=0 + .align 3 + +.L220: + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L224 // ALPHA==0 BETA==0 + b .L223 // ALPHA==0 BETA!=0 + .align 3 + +.L221: // ALPHA!=0 BETA!=0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, VX0, VXA + vfmadd.d VX2, VX2, VXB, VX0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vfmul.d VX1, VX1, VXA + vfmadd.d VX3, VX3, VXB, VX1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, VX0, VXA + vfmadd.d VX2, VX2, VXB, VX0 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + vfmul.d VX1, VX1, VXA + vfmadd.d VX3, VX3, VXB, VX1 + addi.d I, I, -1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: // ALPHA!=0 BETA==0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vfmul.d VX0, VX0, VXA + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VX0, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX0, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX1, VX1, VXA + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + 
vinsgr2vr.d VX0, t2, 1 + vstelm.d VX1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX0, VX0, VXA + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VX0, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX0, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX1, VX1, VXA + addi.d I, I, -1 + vstelm.d VX1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + add.d YY, YY, INCY + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: // ALPHA==0 BETA!=0 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmul.d VX2, VX2, VXB + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX3, VX3, VXB + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX2, VX2, VXB + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX3, VX3, VXB + addi.d I, I, -1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: // ALPHA==0 BETA==0 + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + b .L997 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.d $f12, X, 0 * SIZE + fld.d $f13, Y, 0 * SIZE + addi.d I, I, -1 + fmul.d $f12, $f12, ALPHA + fmadd.d $f13, $f13, BETA, $f12 + fst.d $f13, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/daxpy_lasx.S b/kernel/loongarch64/daxpy_lasx.S new file mode 100644 index 000000000..bafd871ab --- /dev/null +++ b/kernel/loongarch64/daxpy_lasx.S @@ -0,0 +1,338 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define XX $r5 +#define YY $r6 +#define ALPHA $f0 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VXA $xr23 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.d.l a1, a1 + movgr2fr.d a2, TEMP + ffint.d.l a2, a2 + fcmp.ceq.d $fcc0, ALPHA, a1 + bcnez $fcc0, .L999 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + movfr2gr.d t1, ALPHA + xvreplgr2vr.d VXA, t1 + + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // 
INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L113 + fcmp.ceq.d $fcc0, ALPHA, a2 + bceqz $fcc0, .L112 + .align 3 + +.L111: + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvfadd.d VX2, VX0, VX2 + xvfadd.d VX3, VX1, VX3 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + b .L113 + .align 3 + +.L112: + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvfmadd.d VX2, VX0, VXA, VX2 + xvfmadd.d VX3, VX1, VXA, VX3 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L112 + .align 3 + +.L113: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L114: + fld.d $f12, X, 0 * SIZE + fld.d $f14, Y, 0 * SIZE + addi.d I, I, -1 + fmadd.d $f14, $f12, $f0, $f14 + fst.d $f14, Y, 0 * SIZE + addi.d X, X, SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L114 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + move YY, Y + .align 3 + +.L121: + xvld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmadd.d VX2, VX0, VXA, VX2 + xvld VX1, X, 4 * SIZE + xvstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvfmadd.d VX3, VX1, VXA, VX3 + addi.d I, I, -1 + xvstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + fld.d $f12, X, 0 * SIZE + fld.d $f14, Y, 0 * SIZE + addi.d I, I, -1 + fmadd.d $f14, $f12, $f0, $f14 + fst.d $f14, Y, 0 * SIZE + addi.d X, X, SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L212 + .align 3 + +.L211: + xvld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvfmadd.d VX2, VX0, VXA, VX2 + xvld VX3, Y, 4 * SIZE + xvst VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvfmadd.d VX3, VX1, VXA, VX3 + addi.d I, I, -1 + xvst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + fld.d $f12, X, 0 * SIZE + fld.d $f14, Y, 0 * SIZE 
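+ // scalar tail for the remaining N & 7 elements: y[i] := alpha * x[i] + y[i] (X strided, Y contiguous)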
+ addi.d I, I, -1 + fmadd.d $f14, $f12, $f0, $f14 + fst.d $f14, Y, 0 * SIZE + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + move YY, Y + .align 3 + +.L222: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmadd.d VX2, VX0, VXA, VX2 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvfmadd.d VX3, VX1, VXA, VX3 + addi.d I, I, -1 + xvstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 3 + add.d YY, YY, INCY + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + fld.d $f12, X, 0 * SIZE + fld.d $f14, Y, 0 * SIZE + addi.d I, I, -1 + fmadd.d $f14, $f12, $f0, $f14 + fst.d $f14, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + b .L999 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/daxpy_lsx.S b/kernel/loongarch64/daxpy_lsx.S new file mode 100644 index 000000000..fc88f0bb9 --- /dev/null +++ b/kernel/loongarch64/daxpy_lsx.S @@ -0,0 +1,365 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define XX $r5 +#define YY $r6 +#define ALPHA $f0 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VXA $vr23 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.d.l a1, a1 + movgr2fr.d a2, TEMP + ffint.d.l a2, a2 + fcmp.ceq.d $fcc0, ALPHA, a1 + bcnez $fcc0, .L999 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + movfr2gr.d t1, ALPHA + vreplgr2vr.d VXA, t1 + + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L113 + fcmp.ceq.d $fcc0, ALPHA, a2 + bceqz $fcc0, .L112 + .align 3 + +.L111: + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 2 * SIZE + vld VX3, 
Y, 2 * SIZE + vfadd.d VX2, VX0, VX2 + vfadd.d VX3, VX1, VX3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + vfadd.d VX2, VX0, VX2 + vfadd.d VX3, VX1, VX3 + addi.d I, I, -1 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + b .L113 + .align 3 + +.L112: + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vfmadd.d VX2, VX0, VXA, VX2 + vfmadd.d VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + addi.d X, X, 8 * SIZE + vfmadd.d VX2, VX0, VXA, VX2 + vfmadd.d VX3, VX1, VXA, VX3 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L112 + .align 3 + +.L113: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L114: + fld.d $f12, X, 0 * SIZE + fld.d $f14, Y, 0 * SIZE + addi.d I, I, -1 + fmadd.d $f14, $f12, $f0, $f14 + fst.d $f14, Y, 0 * SIZE + addi.d X, X, SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L114 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + move YY, Y + .align 3 + +.L121: + vld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmadd.d VX2, VX0, VXA, VX2 + vld VX1, X, 2 * SIZE + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vfmadd.d VX3, VX1, VXA, VX3 + vld VX0, X, 4 * SIZE + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmadd.d VX2, VX0, VXA, VX2 + vld VX1, X, 6 * SIZE + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vfmadd.d VX3, VX1, VXA, VX3 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + fld.d $f12, X, 0 * SIZE + fld.d $f14, Y, 0 * SIZE + addi.d I, I, -1 + fmadd.d $f14, $f12, $f0, $f14 + fst.d $f14, Y, 0 * SIZE + addi.d X, X, SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L212 + .align 3 + +.L211: + vld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vfmadd.d VX2, VX0, VXA, VX2 + vld VX3, Y, 2 * SIZE + vst VX2, Y, 0 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmadd.d VX3, VX1, VXA, VX3 + vld VX2, Y, 4 * SIZE + vst VX3, Y, 2 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vfmadd.d VX2, VX0, VXA, VX2 + vld VX3, Y, 6 * SIZE + vst VX2, Y, 4 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d 
VX1, t4, 1 + add.d X, X, INCX + vfmadd.d VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vst VX3, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + fld.d $f12, X, 0 * SIZE + fld.d $f14, Y, 0 * SIZE + addi.d I, I, -1 + fmadd.d $f14, $f12, $f0, $f14 + fst.d $f14, Y, 0 * SIZE + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + move YY, Y + .align 3 + +.L222: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmadd.d VX2, VX0, VXA, VX2 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vfmadd.d VX3, VX1, VXA, VX3 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmadd.d VX2, VX0, VXA, VX2 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + vfmadd.d VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + fld.d $f12, X, 0 * SIZE + fld.d $f14, Y, 0 * SIZE + addi.d I, I, -1 + fmadd.d $f14, $f12, $f0, $f14 + fst.d $f14, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/saxpby_lasx.S b/kernel/loongarch64/saxpby_lasx.S new file mode 100644 index 000000000..c5d1ff402 --- /dev/null +++ b/kernel/loongarch64/saxpby_lasx.S @@ -0,0 +1,597 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define ALPHA $f0 +#define X $r5 +#define INCX $r6 +#define BETA $f1 +#define Y $r7 +#define INCY $r8 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define a2 $f13 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VXA $xr23 +#define VXB $xr9 +#define VXZ $xr19 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.s.l a1, a1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + movfr2gr.s t1, ALPHA + xvreplgr2vr.w VXA, t1 + movfr2gr.s t2, BETA + xvreplgr2vr.w VXB, t2 + movfr2gr.s t3, a1 + xvreplgr2vr.w VXZ, t3 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne 
INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + fcmp.ceq.s $fcc0, ALPHA, a1 + bcnez $fcc0, .L110 + fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L112 // ALPHA!=0 BETA==0 + b .L111 // ALPHA!=0 BETA!=0 + .align 3 + +.L110: + fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L114 // ALPHA==0 BETA==0 + b .L113 // ALPHA==0 BETA!=0 + .align 3 + +.L111: // ALPHA!=0 BETA!=0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvfmul.s VX0, VX0, VXA + addi.d I, I, -1 + xvfmadd.s VX2, VX2, VXB, VX0 + xvst VX2, Y, 0 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // ALPHA!=0 BETA==0 + xvld VX0, X, 0 * SIZE + xvfmul.s VX0, VX0, VXA + addi.d I, I, -1 + xvst VX0, Y, 0 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // ALPHA==0 BETA!=0 + xvld VX2, Y, 0 * SIZE + xvfmul.s VX2, VX2, VXB + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // ALPHA==0 BETA==0 + xvst VXZ, Y, 0 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + fcmp.ceq.s $fcc0, ALPHA, a1 + bcnez $fcc0, .L120 + fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L122 // ALPHA!=0 BETA==0 + b .L121 // ALPHA!=0 BETA!=0 + .align 3 + +.L120: + fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L124 // ALPHA==0 BETA==0 + b .L123 // ALPHA==0 BETA!=0 + .align 3 + +.L121: // ALPHA!=0 BETA!=0 + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX0, VX0, VXA + xvfmadd.s VX2, VX2, VXB, VX0 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + b .L997 + .align 3 + +.L122: // ALPHA!=0 BETA==0 + xvld VX0, X, 0 * SIZE + xvfmul.s VX0, VX0, VXA + addi.d I, I, -1 + xvstelm.w VX0, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 7 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L122 + b .L997 + .align 3 + +.L123: // ALPHA==0 BETA!=0 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + 
add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX2, VX2, VXB + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L123 + b .L997 + .align 3 + +.L124: // ALPHA==0 BETA==0 + xvstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 7 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L124 + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + fcmp.ceq.s $fcc0, ALPHA, a1 + bcnez $fcc0, .L210 + fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L212 // ALPHA!=0 BETA==0 + b .L211 // ALPHA!=0 BETA!=0 + .align 3 + +.L210: + fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L214 // ALPHA==0 BETA==0 + b .L213 // ALPHA==0 BETA!=0 + .align 3 + +.L211: // ALPHA!=0 BETA!=0 + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VX0, VXA, VX0 + xvfmadd.s VX2, VX2, VXB, VX0 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + b .L997 + .align 3 + +.L212: // ALPHA!=0 BETA==0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VX0, VXA, VX0 + addi.d I, I, -1 + xvst VX0, Y, 0 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L212 + b .L997 + .align 3 + +.L213: // ALPHA==0 BETA!=0 + xvld VX2, Y, 0 * SIZE + xvfmul.s VX2, VX2, VXB + xvst VX2, Y, 0 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L213 + b .L997 + .align 3 + +.L214: // ALPHA==0 BETA==0 + xvst VXZ, Y, 0 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L214 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + fcmp.ceq.s $fcc0, ALPHA, a1 + bcnez $fcc0, .L220 + fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L222 // ALPHA!=0 BETA==0 + b .L221 // ALPHA!=0 BETA!=0 + .align 3 + +.L220: + fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L224 // ALPHA==0 BETA==0 + b .L223 // 
ALPHA==0 BETA!=0 + .align 3 + +.L221: // ALPHA!=0 BETA!=0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX0, VX0, VXA + xvfmadd.s VX2, VX2, VXB, VX0 + addi.d I, I, -1 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 + add.d YY, YY, INCY + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: // ALPHA!=0 BETA==0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VX0, VX0, VXA + addi.d I, I, -1 + xvstelm.w VX0, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 7 + add.d YY, YY, INCY + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: // ALPHA==0 BETA!=0 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX2, VX2, VXB + addi.d I, I, -1 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + 
add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 + add.d YY, YY, INCY + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: // ALPHA==0 BETA==0 + xvstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 7 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + b .L997 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.s $f12, X, 0 * SIZE + fld.s $f13, Y, 0 * SIZE + addi.d I, I, -1 + fmul.s $f12, $f12, ALPHA + fmadd.s $f13, $f13, BETA, $f12 + fst.s $f13, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/saxpby_lsx.S b/kernel/loongarch64/saxpby_lsx.S new file mode 100644 index 000000000..7f8cea2dd --- /dev/null +++ b/kernel/loongarch64/saxpby_lsx.S @@ -0,0 +1,629 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define ALPHA $f0 +#define X $r5 +#define INCX $r6 +#define BETA $f1 +#define Y $r7 +#define INCY $r8 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define a2 $f13 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VXA $vr23 +#define VXB $vr9 +#define VXZ $vr19 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.s.l a1, a1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + movfr2gr.s t1, ALPHA + vreplgr2vr.w VXA, t1 + movfr2gr.s t2, BETA + vreplgr2vr.w VXB, t2 + movfr2gr.s t3, a1 + vreplgr2vr.w VXZ, t3 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + fcmp.ceq.s $fcc0, ALPHA, a1 + bcnez $fcc0, .L110 + fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L112 // ALPHA!=0 BETA==0 + b .L111 // ALPHA!=0 BETA!=0 + .align 3 + +.L110: + fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L114 // ALPHA==0 BETA==0 + b .L113 // ALPHA==0 BETA!=0 + .align 3 + +.L111: // ALPHA!=0 BETA!=0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vfmul.s VX0, VX0, VXA + vfmul.s VX1, VX1, VXA + vfmadd.s VX2, VX2, VXB, VX0 + vfmadd.s VX3, VX3, VXB, VX1 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // ALPHA!=0 BETA==0 + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vfmul.s VX0, VX0, VXA + vfmul.s VX1, VX1, VXA + vst VX0, Y, 0 * SIZE + vst VX1, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // ALPHA==0 BETA!=0 + vld VX2, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + vfmul.s VX2, VX2, VXB + vfmul.s VX3, VX3, VXB + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // ALPHA==0 BETA==0 + vst VXZ, Y, 0 * SIZE + vst VXZ, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + 
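+// Non-unit INCY path: y is gathered into vector lanes with vinsgr2vr.w, the update is computed with vfmul.s/vfmadd.s, and the result is scattered back one element at a time through YY with vstelm.w.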
+.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + fcmp.ceq.s $fcc0, ALPHA, a1 + bcnez $fcc0, .L120 + fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L122 // ALPHA!=0 BETA==0 + b .L121 // ALPHA!=0 BETA!=0 + .align 3 + +.L120: + fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L124 // ALPHA==0 BETA==0 + b .L123 // ALPHA==0 BETA!=0 + .align 3 + +.L121: // ALPHA!=0 BETA!=0 + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VX0, VX0, VXA + vld VX1, X, 4 * SIZE + vfmadd.s VX2, VX2, VXB, VX0 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + vfmul.s VX1, VX1, VXA + vfmadd.s VX3, VX3, VXB, VX1 + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + b .L997 + .align 3 + +.L122: // ALPHA!=0 BETA==0 + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vfmul.s VX0, VX0, VXA + vfmul.s VX1, VX1, VXA + vstelm.w VX0, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L122 + b .L997 + .align 3 + +.L123: // ALPHA==0 BETA!=0 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VX2, VX2, VXB + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + vfmul.s VX3, VX3, VXB + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 + add.d YY, YY, INCY + blt $r0, I, .L123 + b .L997 + .align 3 + +.L124: // ALPHA==0 BETA==0 + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt 
$r0, I, .L124 + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + fcmp.ceq.s $fcc0, ALPHA, a1 + bcnez $fcc0, .L210 + fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L212 // ALPHA!=0 BETA==0 + b .L211 // ALPHA!=0 BETA!=0 + .align 3 + +.L210: + fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L214 // ALPHA==0 BETA==0 + b .L213 // ALPHA==0 BETA!=0 + .align 3 + +.L211: // ALPHA!=0 BETA!=0 + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmul.s VX0, VXA, VX0 + vld VX3, Y, 4 * SIZE + vfmadd.s VX2, VX2, VXB, VX0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vst VX2, Y, 0 * SIZE + vfmul.s VX1, VX1, VXA + vfmadd.s VX3, VX3, VXB, VX1 + addi.d I, I, -1 + vst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + b .L997 + .align 3 + +.L212: // ALPHA!=0 BETA==0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmul.s VX0, VXA, VX0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vst VX0, Y, 0 * SIZE + vfmul.s VX1, VX1, VXA + addi.d I, I, -1 + vst VX1, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L212 + b .L997 + .align 3 + +.L213: // ALPHA==0 BETA!=0 + vld VX2, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + vfmul.s VX2, VX2, VXB + vfmul.s VX3, VX3, VXB + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L213 + b .L997 + .align 3 + +.L214: // ALPHA==0 BETA==0 + vst VXZ, Y, 0 * SIZE + vst VXZ, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L214 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + fcmp.ceq.s $fcc0, ALPHA, a1 + bcnez $fcc0, .L220 + fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L222 // ALPHA!=0 BETA==0 + b .L221 // ALPHA!=0 BETA!=0 + .align 3 + +.L220: + fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L224 // ALPHA==0 BETA==0 + b .L223 // ALPHA==0 BETA!=0 + .align 3 + +.L221: // ALPHA!=0 BETA!=0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VX0, VX0, VXA + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vfmadd.s VX2, VX2, VXB, VX0 + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, 
t3, 2 + vinsgr2vr.w VX1, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vfmul.s VX1, VX1, VXA + addi.d I, I, -1 + vfmadd.s VX3, VX3, VXB, VX1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 + add.d YY, YY, INCY + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: // ALPHA!=0 BETA==0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmul.s VX0, VX0, VXA + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vstelm.w VX0, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 3 + add.d YY, YY, INCY + vfmul.s VX1, VX1, VXA + addi.d I, I, -1 + vstelm.w VX1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 3 + add.d YY, YY, INCY + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: // ALPHA==0 BETA!=0 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VX2, VX2, VXB + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + vfmul.s VX3, VX3, VXB + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 + add.d YY, YY, INCY + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: // ALPHA==0 BETA==0 + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + b .L997 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.s $f12, X, 0 * SIZE + fld.s $f13, Y, 0 * SIZE + addi.d I, I, -1 + fmul.s $f12, $f12, ALPHA + fmadd.s $f13, $f13, BETA, $f12 + fst.s $f13, Y, 0 * SIZE + add.d X, X, 
INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/saxpy_lasx.S b/kernel/loongarch64/saxpy_lasx.S new file mode 100644 index 000000000..609e26328 --- /dev/null +++ b/kernel/loongarch64/saxpy_lasx.S @@ -0,0 +1,323 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define XX $r5 +#define YY $r6 +#define ALPHA $f0 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VXA $xr23 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.s.l a1, a1 + movgr2fr.d a2, TEMP + ffint.s.l a2, a2 + fcmp.ceq.s $fcc0, ALPHA, a1 + bcnez $fcc0, .L999 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + movfr2gr.s t1, ALPHA + xvreplgr2vr.w VXA, t1 + + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L113 + fcmp.ceq.s $fcc0, ALPHA, a2 + bceqz $fcc0, .L112 + .align 3 + +.L111: + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + addi.d I, I, -1 + xvfadd.s VX2, VX0, VX2 + xvst VX2, Y, 0 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + b .L113 + .align 3 + +.L112: + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + addi.d I, I, -1 + xvfmadd.s VX2, VX0, VXA, VX2 + xvst VX2, Y, 0 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L112 + .align 3 + +.L113: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L114: + fld.s $f12, X, 0 * SIZE + fld.s $f14, Y, 0 * SIZE + addi.d I, I, -1 + fmadd.s $f14, $f12, $f0, $f14 + fst.s $f14, Y, 0 * SIZE + addi.d X, X, SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L114 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + move YY, Y + .align 3 + +.L121: + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmadd.s VX2, VX0, VXA, VX2 + addi.d I, I, -1 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + fld.s $f12, X, 0 * SIZE + fld.s $f14, Y, 0 * SIZE + addi.d I, I, -1 + fmadd.s $f14, $f12, $f0, $f14 + fst.s $f14, Y, 0 * SIZE + addi.d X, X, SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + 
.align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L212 + .align 3 + +.L211: + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvfmadd.s VX2, VX0, VXA, VX2 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + fld.s $f12, X, 0 * SIZE + fld.s $f14, Y, 0 * SIZE + addi.d I, I, -1 + fmadd.s $f14, $f12, $f0, $f14 + fst.s $f14, Y, 0 * SIZE + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + move YY, Y + .align 3 + +.L222: + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmadd.s VX2, VX0, VXA, VX2 + addi.d I, I, -1 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 + add.d YY, YY, INCY + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + fld.s $f12, X, 0 * SIZE + fld.s $f14, Y, 0 * SIZE + addi.d I, I, -1 + fmadd.s $f14, $f12, $f0, $f14 + fst.s $f14, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/saxpy_lsx.S b/kernel/loongarch64/saxpy_lsx.S new file mode 100644 index 000000000..f47415ed6 --- /dev/null +++ b/kernel/loongarch64/saxpy_lsx.S @@ -0,0 +1,338 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define XX $r5 +#define YY $r6 +#define ALPHA $f0 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define 
a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VXA $vr23 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.s.l a1, a1 + movgr2fr.d a2, TEMP + ffint.s.l a2, a2 + fcmp.ceq.s $fcc0, ALPHA, a1 + bcnez $fcc0, .L999 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + movfr2gr.s t1, ALPHA + vreplgr2vr.w VXA, t1 + + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L113 + fcmp.ceq.s $fcc0, ALPHA, a2 + bceqz $fcc0, .L112 + .align 3 + +.L111: + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vfadd.s VX2, VX0, VX2 + vfadd.s VX3, VX1, VX3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L113 + .align 3 + +.L112: + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vfmadd.s VX2, VX0, VXA, VX2 + vfmadd.s VX3, VX1, VXA, VX3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L113 + .align 3 + +.L113: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L114: + fld.s $f12, X, 0 * SIZE + fld.s $f14, Y, 0 * SIZE + addi.d I, I, -1 + fmadd.s $f14, $f12, $f0, $f14 + fst.s $f14, Y, 0 * SIZE + addi.d X, X, SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L114 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + move YY, Y + .align 3 + +.L121: + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmadd.s VX2, VX0, VXA, VX2 + vld VX1, X, 4 * SIZE + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vfmadd.s VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + fld.s $f12, X, 0 * SIZE + fld.s $f14, Y, 0 * SIZE + addi.d I, I, -1 + fmadd.s $f14, $f12, $f0, $f14 + fst.s $f14, Y, 0 * SIZE + addi.d X, X, SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L212 + .align 3 + +.L211: + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmadd.s VX2, VX0, VXA, VX2 + vld VX3, Y, 4 * SIZE + vst VX2, 
Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + vfmadd.s VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + fld.s $f12, X, 0 * SIZE + fld.s $f14, Y, 0 * SIZE + addi.d I, I, -1 + fmadd.s $f14, $f12, $f0, $f14 + fst.s $f14, Y, 0 * SIZE + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + move YY, Y + .align 3 + +.L222: + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmadd.s VX2, VX0, VXA, VX2 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vfmadd.s VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 + add.d YY, YY, INCY + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + fld.s $f12, X, 0 * SIZE + fld.s $f14, Y, 0 * SIZE + addi.d I, I, -1 + fmadd.s $f14, $f12, $f0, $f14 + fst.s $f14, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE From c80e7e27d1b70d6f9cf80bd64bde096d0c3359e3 Mon Sep 17 00:00:00 2001 From: yancheng Date: Thu, 7 Dec 2023 13:08:03 +0800 Subject: [PATCH 13/15] loongarch64: Add optimizations for sum and asum. 
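
The new kernels vectorize the sum and asum reductions with LSX/LASX: eight
elements are processed per iteration, strided inputs are gathered into vector
lanes with [x]vinsgr2vr before the vector adds, and a scalar loop handles the
remaining n % 8 elements. For reference, a scalar sketch of what each kernel
computes is given below (illustrative C only, not part of the patch; the
function names and plain long/double types are placeholders for the OpenBLAS
kernel interface, and the single-precision kernels do the same with
float/fabsf):

    #include <math.h>

    /* sum: plain sum of the strided vector x */
    static double sum_ref(long n, const double *x, long incx)
    {
        double s = 0.0;
        for (long i = 0; i < n; i++)
            s += x[i * incx];
        return s;
    }

    /* asum: sum of absolute values of the strided vector x */
    static double asum_ref(long n, const double *x, long incx)
    {
        double s = 0.0;
        for (long i = 0; i < n; i++)
            s += fabs(x[i * incx]);
        return s;
    }
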
--- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 6 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 6 + kernel/loongarch64/dasum_lasx.S | 148 +++++++++++++++++++++ kernel/loongarch64/dasum_lsx.S | 158 +++++++++++++++++++++++ kernel/loongarch64/dsum_lasx.S | 125 ++++++++++++++++++ kernel/loongarch64/dsum_lsx.S | 123 ++++++++++++++++++ kernel/loongarch64/sasum_lasx.S | 157 ++++++++++++++++++++++ kernel/loongarch64/sasum_lsx.S | 148 +++++++++++++++++++++ kernel/loongarch64/ssum_lasx.S | 140 ++++++++++++++++++++ kernel/loongarch64/ssum_lsx.S | 125 ++++++++++++++++++ 10 files changed, 1136 insertions(+) create mode 100644 kernel/loongarch64/dasum_lasx.S create mode 100644 kernel/loongarch64/dasum_lsx.S create mode 100644 kernel/loongarch64/dsum_lasx.S create mode 100644 kernel/loongarch64/dsum_lsx.S create mode 100644 kernel/loongarch64/sasum_lasx.S create mode 100644 kernel/loongarch64/sasum_lsx.S create mode 100644 kernel/loongarch64/ssum_lasx.S create mode 100644 kernel/loongarch64/ssum_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index a94303151..5e83e67a4 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -43,4 +43,10 @@ DAXPYKERNEL = daxpy_lsx.S SAXPBYKERNEL = saxpby_lsx.S DAXPBYKERNEL = daxpby_lsx.S +SSUMKERNEL = ssum_lsx.S +DSUMKERNEL = dsum_lsx.S + +SASUMKERNEL = sasum_lsx.S +DASUMKERNEL = dasum_lsx.S + endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 4cfd53058..2cc2edb74 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -43,6 +43,12 @@ DAXPYKERNEL = daxpy_lasx.S SAXPBYKERNEL = saxpby_lasx.S DAXPBYKERNEL = daxpby_lasx.S +SSUMKERNEL = ssum_lasx.S +DSUMKERNEL = dsum_lasx.S + +SASUMKERNEL = sasum_lasx.S +DASUMKERNEL = dasum_lasx.S + DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S DGEMMITCOPY = dgemm_tcopy_16.S diff --git a/kernel/loongarch64/dasum_lasx.S b/kernel/loongarch64/dasum_lasx.S new file mode 100644 index 000000000..49de98c40 --- /dev/null +++ b/kernel/loongarch64/dasum_lasx.S @@ -0,0 +1,148 @@ +#define ASSEMBLER +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 +#define VT0 $xr23 +#define VT1 $xr22 +#define res1 $xr16 +#define res2 $xr17 +#define res0 $xr18 +#define neg1 $xr19 + + PROLOGUE + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + xvxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d t1, -1 + xvreplgr2vr.d neg1, t1 + xvffint.d.l neg1, neg1 + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfmul.d VX2, neg1, VX0 + xvfmul.d VX3, neg1, VX1 + xvfcmp.clt.d VT0, VX0, res0 + xvfcmp.clt.d VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + fld.d $f12, X, 0 * SIZE + fabs.d $f12, $f12 + fadd.d $f16, $f12, $f16 + addi.d I, I, 
-1 + addi.d X, X, SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmul.d VX2, neg1, VX0 + xvfmul.d VX3, neg1, VX1 + xvfcmp.clt.d VT0, VX0, res0 + xvfcmp.clt.d VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + fld.d $f12, X, 0 * SIZE + fabs.d $f12, $f12 + fadd.d $f16, $f12, $f16 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.d $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/dasum_lsx.S b/kernel/loongarch64/dasum_lsx.S new file mode 100644 index 000000000..94750815e --- /dev/null +++ b/kernel/loongarch64/dasum_lsx.S @@ -0,0 +1,158 @@ +#define ASSEMBLER +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 +#define VT0 $vr23 +#define VT1 $vr22 +#define res1 $vr16 +#define res2 $vr17 +#define res0 $vr18 +#define neg1 $vr19 + + PROLOGUE + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + vxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d t1, -1 + vreplgr2vr.d neg1, t1 + vffint.d.l neg1, neg1 + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + fld.d $f12, X, 0 * SIZE + fabs.d $f12, $f12 + fadd.d $f16, $f12, $f16 + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, 
VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + fld.d $f12, X, 0 * SIZE + fabs.d $f12, $f12 + fadd.d $f16, $f12, $f16 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.d $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/dsum_lasx.S b/kernel/loongarch64/dsum_lasx.S new file mode 100644 index 000000000..3c51dab60 --- /dev/null +++ b/kernel/loongarch64/dsum_lasx.S @@ -0,0 +1,125 @@ +#define ASSEMBLER +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 +#define res1 $xr16 +#define res2 $xr17 + PROLOGUE + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + fld.d $f12, X, 0 * SIZE + addi.d I, I, -1 + fadd.d $f16, $f12, $f16 + addi.d X, X, SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + fld.d $f12, X, 0 * SIZE + fadd.d $f16, $f12, $f16 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.d $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git 
a/kernel/loongarch64/dsum_lsx.S b/kernel/loongarch64/dsum_lsx.S new file mode 100644 index 000000000..402d087df --- /dev/null +++ b/kernel/loongarch64/dsum_lsx.S @@ -0,0 +1,123 @@ +#define ASSEMBLER +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 +#define res1 $vr16 +#define res2 $vr17 + PROLOGUE + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + fld.d $f12, X, 0 * SIZE + fadd.d $f16, $f12, $f16 + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + fld.d $f12, X, 0 * SIZE + fadd.d $f16, $f12, $f16 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.d $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/sasum_lasx.S b/kernel/loongarch64/sasum_lasx.S new file mode 100644 index 000000000..a452701aa --- /dev/null +++ b/kernel/loongarch64/sasum_lasx.S @@ -0,0 +1,157 @@ +#define ASSEMBLER +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 +#define VT0 $xr23 +#define VT1 $xr22 +#define res1 $xr16 +#define res2 $xr17 +#define res0 $xr18 +#define neg1 $xr19 + + PROLOGUE + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + xvxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.w t1, -1 + xvreplgr2vr.w neg1, t1 + xvffint.s.w neg1, neg1 + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: + xvld VX0, X, 0 * SIZE + xvfmul.s VX2, neg1, VX0 + xvfcmp.clt.s VT0, VX0, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvfadd.s res1, VX0, res1 + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, 
res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + fld.s $f12, X, 0 * SIZE + fabs.s $f12, $f12 + fadd.s $f16, $f12, $f16 + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvfmul.s VX2, neg1, VX0 + xvfcmp.clt.s VT0, VX0, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvfadd.s res1, VX0, res1 + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + fld.s $f12, X, 0 * SIZE + fabs.s $f12, $f12 + fadd.s $f16, $f12, $f16 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/sasum_lsx.S b/kernel/loongarch64/sasum_lsx.S new file mode 100644 index 000000000..87026a144 --- /dev/null +++ b/kernel/loongarch64/sasum_lsx.S @@ -0,0 +1,148 @@ +#define ASSEMBLER +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 +#define VT0 $vr23 +#define VT1 $vr22 +#define res1 $vr16 +#define res2 $vr17 +#define res0 $vr18 +#define neg1 $vr19 + + PROLOGUE + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + vxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.w t1, -1 + vreplgr2vr.w neg1, t1 + vffint.s.w neg1, neg1 + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vfmul.s VX2, neg1, VX0 + vfmul.s VX3, neg1, VX1 + vfcmp.clt.s VT0, VX0, res0 + vfcmp.clt.s VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.s res2, VX0, VX1 + vfadd.s res1, res1, res2 + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + fld.s $f12, X, 0 * SIZE + fabs.s $f12, $f12 + fadd.s $f16, 
$f12, $f16 + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfmul.s VX2, neg1, VX0 + vfmul.s VX3, neg1, VX1 + vfcmp.clt.s VT0, VX0, res0 + vfcmp.clt.s VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.s res2, VX0, VX1 + vfadd.s res1, res1, res2 + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + fld.s $f12, X, 0 * SIZE + fabs.s $f12, $f12 + fadd.s $f16, $f12, $f16 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/ssum_lasx.S b/kernel/loongarch64/ssum_lasx.S new file mode 100644 index 000000000..7cf57bc77 --- /dev/null +++ b/kernel/loongarch64/ssum_lasx.S @@ -0,0 +1,140 @@ +#define ASSEMBLER +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 +#define res1 $xr16 +#define res2 $xr17 + PROLOGUE + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: + xvld VX0, X, 0 * SIZE + xvfadd.s res1, VX0, res1 + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + fld.s $f12, X, 0 * SIZE + fadd.s $f16, $f12, $f16 + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvfadd.s res1, VX0, res1 + addi.d I, I, -1 + blt $r0, I, .L21 
+ .align 3 + +.L22: + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + fld.s $f12, X, 0 * SIZE + fadd.s $f16, $f12, $f16 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/ssum_lsx.S b/kernel/loongarch64/ssum_lsx.S new file mode 100644 index 000000000..de63c69e3 --- /dev/null +++ b/kernel/loongarch64/ssum_lsx.S @@ -0,0 +1,125 @@ +#define ASSEMBLER +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 +#define res1 $vr16 +#define res2 $vr17 + PROLOGUE + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vfadd.s res2, VX0, VX1 + vfadd.s res1, res1, res2 + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + fld.s $f12, X, 0 * SIZE + fadd.s $f16, $f12, $f16 + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfadd.s res2, VX0, VX1 + vfadd.s res1, res1, res2 + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + fld.s $f12, X, 0 * SIZE + fadd.s $f16, $f12, $f16 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE \ No newline at end of file From f9b468990e2ca2af5ea3f5848dc23152aa0da5bb Mon Sep 17 00:00:00 2001 From: yancheng Date: Thu, 7 Dec 2023 13:12:29 +0800 Subject: [PATCH 14/15] loongarch64: Add optimizations for rot. 
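
The rot kernels apply the BLAS plane (Givens) rotation to a pair of vectors,
eight elements per iteration, with dedicated fast paths for c == 0 and/or
s == 0 and separate code for unit and non-unit strides (strided operands are
gathered with [x]vinsgr2vr and written back with [x]vstelm). A scalar sketch
of the operation being vectorized is shown below (illustrative C only, not
part of the patch; the name rot_ref and the plain long/double types are
placeholders, and the single-precision kernels perform the same update on
float data):

    /* x[i] <- c*x[i] + s*y[i];  y[i] <- c*y[i] - s*x[i] */
    static void rot_ref(long n, double *x, long incx,
                        double *y, long incy, double c, double s)
    {
        for (long i = 0; i < n; i++) {
            double xi = x[i * incx];
            double yi = y[i * incy];
            x[i * incx] = c * xi + s * yi;
            y[i * incy] = c * yi - s * xi;
        }
    }
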
--- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 3 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 3 + kernel/loongarch64/drot_lasx.S | 927 +++++++++++++++++++ kernel/loongarch64/drot_lsx.S | 1050 ++++++++++++++++++++++ kernel/loongarch64/srot_lasx.S | 863 ++++++++++++++++++ kernel/loongarch64/srot_lsx.S | 927 +++++++++++++++++++ 6 files changed, 3773 insertions(+) create mode 100644 kernel/loongarch64/drot_lasx.S create mode 100644 kernel/loongarch64/drot_lsx.S create mode 100644 kernel/loongarch64/srot_lasx.S create mode 100644 kernel/loongarch64/srot_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 5e83e67a4..026ea0d77 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -49,4 +49,7 @@ DSUMKERNEL = dsum_lsx.S SASUMKERNEL = sasum_lsx.S DASUMKERNEL = dasum_lsx.S +SROTKERNEL = srot_lsx.S +DROTKERNEL = drot_lsx.S + endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 2cc2edb74..4905a50a9 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -49,6 +49,9 @@ DSUMKERNEL = dsum_lasx.S SASUMKERNEL = sasum_lasx.S DASUMKERNEL = dasum_lasx.S +SROTKERNEL = srot_lasx.S +DROTKERNEL = drot_lasx.S + DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S DGEMMITCOPY = dgemm_tcopy_16.S diff --git a/kernel/loongarch64/drot_lasx.S b/kernel/loongarch64/drot_lasx.S new file mode 100644 index 000000000..d3644b780 --- /dev/null +++ b/kernel/loongarch64/drot_lasx.S @@ -0,0 +1,927 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define C $f0 +#define S $f1 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VT0 $xr10 +#define VT1 $xr18 +#define VXC $xr23 +#define VXS $xr9 +#define VXZ $xr19 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.d.l a1, a1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + movfr2gr.d t1, C + xvreplgr2vr.d VXC, t1 + movfr2gr.d t2, S + xvreplgr2vr.d VXS, t2 + movfr2gr.d t3, a1 + xvreplgr2vr.d VXZ, t3 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + fcmp.ceq.d $fcc0, C, a1 + bcnez $fcc0, .L110 + fcmp.ceq.d $fcc0, S, a1 + bcnez $fcc0, .L112 // C!=0 S==0 + b .L111 // C!=0 S!=0 + .align 3 + +.L110: + fcmp.ceq.d $fcc0, S, a1 + bcnez $fcc0, .L114 // C==0 S==0 + b .L113 // C==0 S!=0 + .align 3 + +.L111: // C!=0 S!=0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvfmul.d VT0, VX0, VXC + xvfmadd.d VT0, VX2, VXS, VT0 + xvfmul.d VT1, VX0, VXS + xvfmsub.d VT1, VX2, VXC, VT1 + xvst VT0, X, 0 * SIZE + xvst VT1, Y, 0 * SIZE + xvfmul.d VT0, VX1, VXC + xvfmadd.d VT0, VX3, VXS, VT0 + xvfmul.d VT1, VX1, VXS + xvfmsub.d VT1, VX3, VXC, VT1 + xvst VT0, X, 4 * SIZE + xvst VT1, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // C!=0 S==0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvfmul.d VT0, VX0, VXC + 
xvfmul.d VT1, VX2, VXC + xvst VT0, X, 0 * SIZE + xvst VT1, Y, 0 * SIZE + xvfmul.d VT0, VX1, VXC + xvfmul.d VT1, VX3, VXC + xvst VT0, X, 4 * SIZE + xvst VT1, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // C==0 S!=0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvfmul.d VT0, VX2, VXS + xvfmul.d VT1, VX0, VXS + xvfsub.d VT1, VXZ, VT1 + xvst VT0, X, 0 * SIZE + xvst VT1, Y, 0 * SIZE + xvfmul.d VT0, VX3, VXS + xvfmul.d VT1, VX1, VXS + xvfsub.d VT1, VXZ, VT1 + xvst VT0, X, 4 * SIZE + xvst VT1, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // C==0 S==0 + xvst VXZ, X, 0 * SIZE + xvst VXZ, Y, 0 * SIZE + xvst VXZ, X, 4 * SIZE + xvst VXZ, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + move XX, X + fcmp.ceq.d $fcc0, C, a1 + bcnez $fcc0, .L120 + fcmp.ceq.d $fcc0, S, a1 + bcnez $fcc0, .L122 // C!=0 S==0 + b .L121 // C!=0 S!=0 + .align 3 + +.L120: + fcmp.ceq.d $fcc0, S, a1 + bcnez $fcc0, .L124 // C==0 S==0 + b .L123 // C==0 S!=0 + .align 3 + +.L121: // C!=0 S!=0 + xvld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX0, VXC + xvfmadd.d VT0, VX2, VXS, VT0 + xvfmul.d VT1, VX0, VXS + xvfmsub.d VT1, VX2, VXC, VT1 + xvld VX1, X, 4 * SIZE + xvst VT0, X, 0 * SIZE + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX1, VXC + xvfmadd.d VT0, VX3, VXS, VT0 + xvfmul.d VT1, VX1, VXS + xvfmsub.d VT1, VX3, VXC, VT1 + addi.d I, I, -1 + xvst VT0, X, 4 * SIZE + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + b .L997 + .align 3 + +.L122: // C!=0 S==0 + xvld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX0, VXC + xvfmul.d VT1, VX2, VXC + xvld VX1, X, 4 * SIZE + xvst VT0, X, 0 * SIZE + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX1, VXC + 
xvfmul.d VT1, VX3, VXC + addi.d I, I, -1 + xvst VT0, X, 4 * SIZE + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L122 + b .L997 + .align 3 + +.L123: // C==0 S!=0 + xvld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX2, VXS + xvfmul.d VT1, VX0, VXS + xvfsub.d VT1, VXZ, VT1 + xvst VT0, X, 0 * SIZE + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY + xvld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX3, VXS + xvfmul.d VT1, VX1, VXS + xvfsub.d VT1, VXZ, VT1 + addi.d I, I, -1 + xvst VT0, X, 4 * SIZE + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L123 + b .L997 + .align 3 + +.L124: // C==0 S==0 + xvst VXZ, X, 0 * SIZE + xvst VXZ, X, 4 * SIZE + xvstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L124 + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + move XX, X + fcmp.ceq.d $fcc0, C, a1 + bcnez $fcc0, .L210 + fcmp.ceq.d $fcc0, S, a1 + bcnez $fcc0, .L212 // C!=0 S==0 + b .L211 // C!=0 S!=0 + .align 3 + +.L210: + fcmp.ceq.d $fcc0, S, a1 + bcnez $fcc0, .L214 // C==0 S==0 + b .L213 // C==0 S!=0 + .align 3 + +.L211: // C!=0 S!=0 + xvld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvfmul.d VT0, VXC, VX0 + xvfmadd.d VT0, VX2, VXS, VT0 + xvfmul.d VT1, VXS, VX0 + xvfmsub.d VT1, VX2, VXC, VT1 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvst VT1, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvfmul.d VT0, VX1, VXC + xvfmadd.d VT0, VX3, VXS, VT0 + xvfmul.d VT1, VX1, VXS + xvfmsub.d VT1, VX3, VXC, VT1 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + 
xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvst VT1, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + b .L997 + .align 3 + +.L212: // C!=0 S==0 + xvld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvfmul.d VT0, VXC, VX0 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvfmul.d VT1, VX2, VXC + xvst VT1, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvfmul.d VT0, VX1, VXC + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvfmul.d VT1, VX3, VXS + xvst VT1, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L212 + b .L997 + .align 3 + +.L213: // C==0 S!=0 + xvld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvfmul.d VT0, VXS, VX2 + xvfmul.d VT1, VXS, VX0 + xvfsub.d VT1, VXZ, VT1 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvst VT1, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvfmul.d VT0, VX3, VXS + xvfmul.d VT1, VX1, VXS + xvfsub.d VT1, VXZ, VT1 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvst VT1, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L213 + b .L997 + .align 3 + +.L214: // C==0 S==0 + xvstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 3 + add.d XX, XX, INCX + xvst VT1, Y, 0 * SIZE + xvstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 3 + add.d XX, XX, INCX + xvst VT1, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + move XX, X + fcmp.ceq.d $fcc0, C, a1 + bcnez $fcc0, .L220 + fcmp.ceq.d $fcc0, S, a1 + bcnez $fcc0, .L222 // C!=0 S==0 + b .L221 // C!=0 S!=0 + .align 3 + +.L220: + fcmp.ceq.d $fcc0, S, a1 + bcnez $fcc0, .L224 // C==0 S==0 + b .L223 // C==0 S!=0 + .align 3 + +.L221: // C!=0 S!=0 + ld.d 
t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX0, VXC + xvfmadd.d VT0, VX2, VXS, VT0 + xvfmul.d VT1, VX0, VXS + xvfmsub.d VT1, VX2, VXC, VT1 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX1, VXC + xvfmadd.d VT0, VX3, VXS, VT0 + xvfmul.d VT1, VX0, VXS + xvfmsub.d VT1, VX3, VXC, VT1 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: // C!=0 S==0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX0, VXC + xvfmul.d VT1, VX2, VXC + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 
0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX1, VXC + xvfmul.d VT1, VX3, VXC + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: // C==0 S!=0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX2, VXS + xvfmul.d VT1, VX0, VXS + xvfsub.d VT1, VXZ, VT1 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX3, VXS + xvfmul.d VT1, VX0, VXS + xvfsub.d VT1, VXZ, VT1 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: // C==0 S==0 + xvstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + b .L997 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + 
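+    // Scalar tail: process the remaining N & 7 elements one at a time
+    // (any INCX/INCY), applying the plane rotation
+    //   x[i] = C*x[i] + S*y[i],  y[i] = C*y[i] - S*x[i]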
+.L998: + fld.d $f12, X, 0 * SIZE + fld.d $f13, Y, 0 * SIZE + fmul.d $f10, $f12, C + fmadd.d $f10, $f13, S, $f10 + fst.d $f10, X, 0 * SIZE + addi.d I, I, -1 + fmul.d $f20, $f12, S + fmsub.d $f20, $f13, C, $f20 + fst.d $f20, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/drot_lsx.S b/kernel/loongarch64/drot_lsx.S new file mode 100644 index 000000000..6db803b1c --- /dev/null +++ b/kernel/loongarch64/drot_lsx.S @@ -0,0 +1,1050 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define C $f0 +#define S $f1 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VT0 $vr10 +#define VT1 $vr18 +#define VXC $vr23 +#define VXS $vr9 +#define VXZ $vr19 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.d.l a1, a1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + movfr2gr.d t1, C + vreplgr2vr.d VXC, t1 + movfr2gr.d t2, S + vreplgr2vr.d VXS, t2 + movfr2gr.d t3, a1 + vreplgr2vr.d VXZ, t3 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + fcmp.ceq.d $fcc0, C, a1 + bcnez $fcc0, .L110 + fcmp.ceq.d $fcc0, S, a1 + bcnez $fcc0, .L112 // C!=0 S==0 + b .L111 // C!=0 S!=0 + .align 3 + +.L110: + fcmp.ceq.d $fcc0, S, a1 + bcnez $fcc0, .L114 // C==0 S==0 + b .L113 // C==0 S!=0 + .align 3 + +.L111: // C!=0 S!=0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vfmul.d VT0, VX0, VXC + vfmadd.d VT0, VX2, VXS, VT0 + vfmul.d VT1, VX0, VXS + vfmsub.d VT1, VX2, VXC, VT1 + vst VT0, X, 0 * SIZE + vst VT1, Y, 0 * SIZE + vfmul.d VT0, VX1, VXC + vfmadd.d VT0, VX3, VXS, VT0 + vfmul.d VT1, VX1, VXS + vfmsub.d VT1, VX3, VXC, VT1 + vst VT0, X, 2 * SIZE + vst VT1, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + vfmul.d VT0, VX0, VXC + vfmadd.d VT0, VX2, VXS, VT0 + vfmul.d VT1, VX0, VXS + vfmsub.d VT1, VX2, VXC, VT1 + vst VT0, X, 4 * SIZE + vst VT1, Y, 4 * SIZE + vfmul.d VT0, VX1, VXC + vfmadd.d VT0, VX3, VXS, VT0 + vfmul.d VT1, VX1, VXS + vfmsub.d VT1, VX3, VXC, VT1 + vst VT0, X, 6 * SIZE + vst VT1, Y, 6 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // C!=0 S==0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vfmul.d VT0, VX0, VXC + vfmul.d VT1, VX2, VXC + vst VT0, X, 0 * SIZE + vst VT1, Y, 0 * SIZE + vfmul.d VT0, VX1, VXC + vfmul.d VT1, VX3, VXC + vst VT0, X, 2 * SIZE + vst VT1, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + vfmul.d VT0, VX0, VXC + vfmul.d VT1, VX2, VXC + vst VT0, X, 4 * SIZE + vst VT1, Y, 4 * SIZE + vfmul.d VT0, VX1, VXC + vfmul.d VT1, VX3, VXC + vst VT0, X, 6 * SIZE + vst VT1, Y, 6 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // C==0 S!=0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE 
+ vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vfmul.d VT0, VX2, VXS + vfmul.d VT1, VX0, VXS + vfsub.d VT1, VXZ, VT1 + vst VT0, X, 0 * SIZE + vst VT1, Y, 0 * SIZE + vfmul.d VT0, VX3, VXS + vfmul.d VT1, VX1, VXS + vfsub.d VT1, VXZ, VT1 + vst VT0, X, 2 * SIZE + vst VT1, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + vfmul.d VT0, VX2, VXS + vfmul.d VT1, VX0, VXS + vfsub.d VT1, VXZ, VT1 + vst VT0, X, 4 * SIZE + vst VT1, Y, 4 * SIZE + vfmul.d VT0, VX3, VXS + vfmul.d VT1, VX1, VXS + vfsub.d VT1, VXZ, VT1 + vst VT0, X, 6 * SIZE + vst VT1, Y, 6 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // C==0 S==0 + vst VXZ, X, 0 * SIZE + vst VXZ, Y, 0 * SIZE + vst VXZ, X, 2 * SIZE + vst VXZ, Y, 2 * SIZE + vst VXZ, X, 4 * SIZE + vst VXZ, Y, 4 * SIZE + vst VXZ, X, 6 * SIZE + vst VXZ, Y, 6 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + move XX, X + fcmp.ceq.d $fcc0, C, a1 + bcnez $fcc0, .L120 + fcmp.ceq.d $fcc0, S, a1 + bcnez $fcc0, .L122 // C!=0 S==0 + b .L121 // C!=0 S!=0 + .align 3 + +.L120: + fcmp.ceq.d $fcc0, S, a1 + bcnez $fcc0, .L124 // C==0 S==0 + b .L123 // C==0 S!=0 + .align 3 + +.L121: // C!=0 S!=0 + vld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX0, VXC + vfmadd.d VT0, VX2, VXS, VT0 + vfmul.d VT1, VX0, VXS + vfmsub.d VT1, VX2, VXC, VT1 + vst VT0, X, 0 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX0, X, 2 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX0, VXC + vfmadd.d VT0, VX2, VXS, VT0 + vfmul.d VT1, VX0, VXS + vfmsub.d VT1, VX2, VXC, VT1 + vst VT0, X, 2 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX1, VXC + vfmadd.d VT0, VX3, VXS, VT0 + vfmul.d VT1, VX1, VXS + vfmsub.d VT1, VX3, VXC, VT1 + vst VT0, X, 4 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 6 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX1, VXC + vfmadd.d VT0, VX3, VXS, VT0 + vfmul.d VT1, VX1, VXS + vfmsub.d VT1, VX3, VXC, VT1 + vst VT0, X, 6 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + b .L997 + .align 3 + +.L122: // C!=0 S==0 + vld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX0, VXC + vfmul.d VT1, VX2, VXC + vst VT0, X, 0 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX0, X, 2 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX0, VXC + vfmul.d VT1, VX2, VXC + vst VT0, X, 2 * SIZE + 
vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX1, VXC + vfmul.d VT1, VX3, VXC + vst VT0, X, 4 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 6 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX1, VXC + vfmul.d VT1, VX3, VXC + vst VT0, X, 6 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L122 + b .L997 + .align 3 + +.L123: // C==0 S!=0 + vld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX2, VXS + vfmul.d VT1, VX0, VXS + vfsub.d VT1, VXZ, VT1 + vst VT0, X, 0 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX0, X, 2 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX2, VXS + vfmul.d VT1, VX0, VXS + vfsub.d VT1, VXZ, VT1 + vst VT0, X, 2 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX3, VXS + vfmul.d VT1, VX1, VXS + vfsub.d VT1, VXZ, VT1 + vst VT0, X, 4 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 6 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX3, VXS + vfmul.d VT1, VX1, VXS + vfsub.d VT1, VXZ, VT1 + vst VT0, X, 6 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L123 + b .L997 + .align 3 + +.L124: // C==0 S==0 + vst VXZ, X, 0 * SIZE + vst VXZ, X, 4 * SIZE + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L124 + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + move XX, X + fcmp.ceq.d $fcc0, C, a1 + bcnez $fcc0, .L210 + fcmp.ceq.d $fcc0, S, a1 + bcnez $fcc0, .L212 // C!=0 S==0 + b .L211 // C!=0 S!=0 + .align 3 + +.L210: + fcmp.ceq.d $fcc0, S, a1 + bcnez $fcc0, .L214 // C==0 S==0 + b .L213 // C==0 S!=0 + .align 3 + +.L211: // C!=0 S!=0 + vld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vfmul.d VT0, VXC, VX0 + vfmadd.d VT0, VX2, VXS, VT0 + vfmul.d VT1, VXS, VX0 + vfmsub.d VT1, VX2, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX2, Y, 2 * SIZE + ld.d t3, X, 0 * SIZE + 
add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + add.d X, X, INCX + vfmul.d VT0, VXC, VX0 + vfmadd.d VT0, VX2, VXS, VT0 + vfmul.d VT1, VXS, VX0 + vfmsub.d VT1, VX2, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 2 * SIZE + vld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfmul.d VT0, VX1, VXC + vfmadd.d VT0, VX3, VXS, VT0 + vfmul.d VT1, VX1, VXS + vfmsub.d VT1, VX3, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmul.d VT0, VX1, VXC + vfmadd.d VT0, VX3, VXS, VT0 + vfmul.d VT1, VX1, VXS + vfmsub.d VT1, VX3, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + b .L997 + .align 3 + +.L212: // C!=0 S==0 + vld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vfmul.d VT0, VXC, VX0 + vfmul.d VT1, VX2, VXC + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX2, Y, 2 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + add.d X, X, INCX + vfmul.d VT0, VXC, VX0 + vfmul.d VT1, VX2, VXC + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 2 * SIZE + vld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfmul.d VT0, VX1, VXC + vfmul.d VT1, VX3, VXS + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmul.d VT0, VX1, VXC + vfmul.d VT1, VX3, VXS + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + vst VT1, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L212 + b .L997 + .align 3 + +.L213: // C==0 S!=0 + vld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vfmul.d VT0, VXS, VX2 + vfmul.d VT1, VXS, VX0 + vfsub.d VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX2, Y, 2 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + add.d X, X, INCX + vfmul.d VT0, VXS, VX2 + vfmul.d VT1, VXS, VX0 + vfsub.d VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 2 * SIZE + vld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfmul.d VT0, VX3, VXS + vfmul.d VT1, VX1, VXS + vfsub.d VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + 
add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmul.d VT0, VX3, VXS + vfmul.d VT1, VX1, VXS + vfsub.d VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L213 + b .L997 + .align 3 + +.L214: // C==0 S==0 + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + move XX, X + fcmp.ceq.d $fcc0, C, a1 + bcnez $fcc0, .L220 + fcmp.ceq.d $fcc0, S, a1 + bcnez $fcc0, .L222 // C!=0 S==0 + b .L221 // C!=0 S!=0 + .align 3 + +.L220: + fcmp.ceq.d $fcc0, S, a1 + bcnez $fcc0, .L224 // C==0 S==0 + b .L223 // C==0 S!=0 + .align 3 + +.L221: // C!=0 S!=0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX0, VXC + vfmadd.d VT0, VX2, VXS, VT0 + vfmul.d VT1, VX0, VXS + vfmsub.d VT1, VX2, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX0, VXC + vfmadd.d VT0, VX2, VXS, VT0 + vfmul.d VT1, VX0, VXS + vfmsub.d VT1, VX2, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX1, VXC + vfmadd.d VT0, VX3, VXS, VT0 + vfmul.d VT1, VX0, VXS + vfmsub.d VT1, VX3, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX1, VXC + vfmadd.d VT0, VX3, VXS, VT0 + vfmul.d VT1, VX0, VXS + vfmsub.d VT1, VX3, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + 
addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: // C!=0 S==0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX0, VXC + vfmul.d VT1, VX2, VXC + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX0, VXC + vfmul.d VT1, VX2, VXC + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX1, VXC + vfmul.d VT1, VX3, VXC + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX1, VXC + vfmul.d VT1, VX3, VXC + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: // C==0 S!=0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX2, VXS + vfmul.d VT1, VX0, VXS + vfsub.d VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX2, VXS + vfmul.d VT1, VX0, VXS + vfsub.d VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX3, VXS + vfmul.d VT1, VX0, VXS + vfsub.d VT1, VXZ, VT1 + 
vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX3, VXS + vfmul.d VT1, VX0, VXS + vfsub.d VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: // C==0 S==0 + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + b .L997 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.d $f12, X, 0 * SIZE + fld.d $f13, Y, 0 * SIZE + fmul.d $f10, $f12, C + fmadd.d $f10, $f13, S, $f10 + fst.d $f10, X, 0 * SIZE + addi.d I, I, -1 + fmul.d $f20, $f12, S + fmsub.d $f20, $f13, C, $f20 + fst.d $f20, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/srot_lasx.S b/kernel/loongarch64/srot_lasx.S new file mode 100644 index 000000000..9aeb4dcf5 --- /dev/null +++ b/kernel/loongarch64/srot_lasx.S @@ -0,0 +1,863 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define C $f0 +#define S $f1 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VT0 $xr10 +#define VT1 $xr18 +#define VXC $xr23 +#define VXS $xr9 +#define VXZ $xr19 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.s.l a1, a1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + movfr2gr.s t1, C + xvreplgr2vr.w VXC, t1 + movfr2gr.s t2, S + xvreplgr2vr.w VXS, t2 + movfr2gr.s t3, a1 + xvreplgr2vr.w VXZ, t3 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + fcmp.ceq.s $fcc0, C, a1 + bcnez $fcc0, .L110 + fcmp.ceq.s $fcc0, S, a1 + bcnez $fcc0, .L112 // C!=0 S==0 + b .L111 // C!=0 S!=0 + .align 3 + +.L110: + fcmp.ceq.s $fcc0, S, a1 + bcnez $fcc0, .L114 // C==0 S==0 + b .L113 // C==0 S!=0 + .align 3 + +.L111: // C!=0 S!=0 + xvld VX0, X, 0 * 
SIZE + xvld VX2, Y, 0 * SIZE + xvfmul.s VT0, VX0, VXC + xvfmadd.s VT0, VX2, VXS, VT0 + xvfmul.s VT1, VX0, VXS + xvfmsub.s VT1, VX2, VXC, VT1 + xvst VT0, X, 0 * SIZE + xvst VT1, Y, 0 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // C!=0 S==0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvfmul.s VT0, VX0, VXC + xvfmul.s VT1, VX2, VXC + xvst VT0, X, 0 * SIZE + xvst VT1, Y, 0 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // C==0 S!=0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvfmul.s VT0, VX2, VXS + xvfmul.s VT1, VX0, VXS + xvfsub.s VT1, VXZ, VT1 + xvst VT0, X, 0 * SIZE + xvst VT1, Y, 0 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // C==0 S==0 + xvst VXZ, X, 0 * SIZE + xvst VXZ, Y, 0 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + move XX, X + fcmp.ceq.s $fcc0, C, a1 + bcnez $fcc0, .L120 + fcmp.ceq.s $fcc0, S, a1 + bcnez $fcc0, .L122 // C!=0 S==0 + b .L121 // C!=0 S!=0 + .align 3 + +.L120: + fcmp.ceq.s $fcc0, S, a1 + bcnez $fcc0, .L124 // C==0 S==0 + b .L123 // C==0 S!=0 + .align 3 + +.L121: // C!=0 S!=0 + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VT0, VX0, VXC + xvfmadd.s VT0, VX2, VXS, VT0 + xvfmul.s VT1, VX0, VXS + xvfmsub.s VT1, VX2, VXC, VT1 + xvst VT0, X, 0 * SIZE + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 7 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + b .L997 + .align 3 + +.L122: // C!=0 S==0 + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VT0, VX0, VXC + xvfmul.s VT1, VX2, VXC + xvst VT0, X, 0 * SIZE + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY 
+ xvstelm.w VT1, YY, 0, 7 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L122 + b .L997 + .align 3 + +.L123: // C==0 S!=0 + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VT0, VX2, VXS + xvfmul.s VT1, VX0, VXS + xvfsub.s VT1, VXZ, VT1 + xvst VT0, X, 0 * SIZE + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 7 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L123 + b .L997 + .align 3 + +.L124: // C==0 S==0 + xvst VXZ, X, 0 * SIZE + xvstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 7 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L124 + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + move XX, X + fcmp.ceq.s $fcc0, C, a1 + bcnez $fcc0, .L210 + fcmp.ceq.s $fcc0, S, a1 + bcnez $fcc0, .L212 // C!=0 S==0 + b .L211 // C!=0 S!=0 + .align 3 + +.L210: + fcmp.ceq.s $fcc0, S, a1 + bcnez $fcc0, .L214 // C==0 S==0 + b .L213 // C==0 S!=0 + .align 3 + +.L211: // C!=0 S!=0 + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VT0, VXC, VX0 + xvfmadd.s VT0, VX2, VXS, VT0 + xvfmul.s VT1, VX0, VXS + xvfmsub.s VT1, VX2, VXC, VT1 + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvst VT1, Y, 0 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + b .L997 + .align 3 + +.L212: // C!=0 S==0 + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + 
xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VT0, VXC, VX0 + xvfmul.s VT1, VX2, VXC + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvst VT1, Y, 0 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L212 + b .L997 + .align 3 + +.L213: // C==0 S!=0 + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VT0, VXS, VX2 + xvfmul.s VT1, VXS, VX0 + xvfsub.s VT1, VXZ, VT1 + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvst VT1, Y, 0 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L213 + b .L997 + .align 3 + +.L214: // C==0 S==0 + xvstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 3 + add.d XX, XX, INCX + xvst VT1, Y, 0 * SIZE + xvstelm.w VXZ, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 7 + add.d XX, XX, INCX + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + move XX, X + fcmp.ceq.s $fcc0, C, a1 + bcnez $fcc0, .L220 + fcmp.ceq.s $fcc0, S, a1 + bcnez $fcc0, .L222 // C!=0 S==0 + b .L221 // C!=0 S!=0 + .align 3 + +.L220: + fcmp.ceq.s $fcc0, S, a1 + bcnez $fcc0, .L224 // C==0 S==0 + b .L223 // C==0 S!=0 + .align 3 + +.L221: // C!=0 S!=0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + 
xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VT0, VX0, VXC + xvfmadd.s VT0, VX2, VXS, VT0 + xvfmul.s VT1, VX0, VXS + xvfmsub.s VT1, VX2, VXC, VT1 + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 7 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: // C!=0 S==0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VT0, VX0, VXC + xvfmul.s VT1, VX2, VXC + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 7 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: // C==0 S!=0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 
* SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VT0, VX2, VXS + xvfmul.s VT1, VX0, VXS + xvfsub.s VT1, VXZ, VT1 + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 7 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: // C==0 S==0 + xvstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VXZ, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 7 + add.d XX, XX, INCX + xvstelm.w VXZ, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 7 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + b .L997 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.s $f12, X, 0 * SIZE + fld.s $f13, Y, 0 * SIZE + fmul.s $f10, $f12, C + fmadd.s $f10, $f13, S, $f10 + fst.s $f10, X, 0 * SIZE + addi.d I, I, -1 + fmul.s $f20, $f12, S + fmsub.s $f20, $f13, C, $f20 + fst.s $f20, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/srot_lsx.S b/kernel/loongarch64/srot_lsx.S new file mode 100644 index 000000000..8822b58e4 --- /dev/null +++ b/kernel/loongarch64/srot_lsx.S @@ -0,0 +1,927 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define C $f0 +#define S $f1 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 
$vr22 +#define VT0 $vr10 +#define VT1 $vr18 +#define VXC $vr23 +#define VXS $vr9 +#define VXZ $vr19 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.s.l a1, a1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + movfr2gr.s t1, C + vreplgr2vr.w VXC, t1 + movfr2gr.s t2, S + vreplgr2vr.w VXS, t2 + movfr2gr.s t3, a1 + vreplgr2vr.w VXZ, t3 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + fcmp.ceq.s $fcc0, C, a1 + bcnez $fcc0, .L110 + fcmp.ceq.s $fcc0, S, a1 + bcnez $fcc0, .L112 // C!=0 S==0 + b .L111 // C!=0 S!=0 + .align 3 + +.L110: + fcmp.ceq.s $fcc0, S, a1 + bcnez $fcc0, .L114 // C==0 S==0 + b .L113 // C==0 S!=0 + .align 3 + +.L111: // C!=0 S!=0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vfmul.s VT0, VX0, VXC + vfmadd.s VT0, VX2, VXS, VT0 + vfmul.s VT1, VX0, VXS + vfmsub.s VT1, VX2, VXC, VT1 + vst VT0, X, 0 * SIZE + vst VT1, Y, 0 * SIZE + vfmul.s VT0, VX1, VXC + vfmadd.s VT0, VX3, VXS, VT0 + vfmul.s VT1, VX1, VXS + vfmsub.s VT1, VX3, VXC, VT1 + vst VT0, X, 4 * SIZE + vst VT1, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // C!=0 S==0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vfmul.s VT0, VX0, VXC + vfmul.s VT1, VX2, VXC + vst VT0, X, 0 * SIZE + vst VT1, Y, 0 * SIZE + vfmul.s VT0, VX1, VXC + vfmul.s VT1, VX3, VXC + vst VT0, X, 4 * SIZE + vst VT1, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // C==0 S!=0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vfmul.s VT0, VX2, VXS + vfmul.s VT1, VX0, VXS + vfsub.s VT1, VXZ, VT1 + vst VT0, X, 0 * SIZE + vst VT1, Y, 0 * SIZE + vfmul.s VT0, VX3, VXS + vfmul.s VT1, VX1, VXS + vfsub.s VT1, VXZ, VT1 + vst VT0, X, 4 * SIZE + vst VT1, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // C==0 S==0 + vst VXZ, X, 0 * SIZE + vst VXZ, Y, 0 * SIZE + vst VXZ, X, 4 * SIZE + vst VXZ, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + move XX, X + fcmp.ceq.s $fcc0, C, a1 + bcnez $fcc0, .L120 + fcmp.ceq.s $fcc0, S, a1 + bcnez $fcc0, .L122 // C!=0 S==0 + b .L121 // C!=0 S!=0 + .align 3 + +.L120: + fcmp.ceq.s $fcc0, S, a1 + bcnez $fcc0, .L124 // C==0 S==0 + b .L123 // C==0 S!=0 + .align 3 + +.L121: // C!=0 S!=0 + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VT0, VX0, VXC + vfmadd.s VT0, VX2, VXS, VT0 + vfmul.s VT1, VX0, VXS + vfmsub.s VT1, VX2, VXC, VT1 + vst VT0, X, 0 * SIZE + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + 
add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vfmul.s VT0, VX1, VXC + vfmadd.s VT0, VX3, VXS, VT0 + vfmul.s VT1, VX1, VXS + vfmsub.s VT1, VX3, VXC, VT1 + vst VT0, X, 4 * SIZE + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + b .L997 + .align 3 + +.L122: // C!=0 S==0 + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VT0, VX0, VXC + vfmul.s VT1, VX2, VXC + vst VT0, X, 0 * SIZE + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vfmul.s VT0, VX1, VXC + vfmul.s VT1, VX3, VXC + vst VT0, X, 4 * SIZE + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L122 + b .L997 + .align 3 + +.L123: // C==0 S!=0 + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VT0, VX2, VXS + vfmul.s VT1, VX0, VXS + vfsub.s VT1, VXZ, VT1 + vst VT0, X, 0 * SIZE + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vfmul.s VT0, VX3, VXS + vfmul.s VT1, VX1, VXS + vfsub.s VT1, VXZ, VT1 + vst VT0, X, 4 * SIZE + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L123 + b .L997 + .align 3 + +.L124: // C==0 S==0 + vst VXZ, X, 0 * SIZE + vst VXZ, X, 4 * SIZE + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L124 + b .L997 + .align 3 + +.L21:// 
INCX!=1 and INCY==1 + bge $r0, I, .L997 + move XX, X + fcmp.ceq.s $fcc0, C, a1 + bcnez $fcc0, .L210 + fcmp.ceq.s $fcc0, S, a1 + bcnez $fcc0, .L212 // C!=0 S==0 + b .L211 // C!=0 S!=0 + .align 3 + +.L210: + fcmp.ceq.s $fcc0, S, a1 + bcnez $fcc0, .L214 // C==0 S==0 + b .L213 // C==0 S!=0 + .align 3 + +.L211: // C!=0 S!=0 + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmul.s VT0, VXC, VX0 + vfmadd.s VT0, VX2, VXS, VT0 + vfmul.s VT1, VXS, VX0 + vfmsub.s VT1, VX2, VXC, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + vfmul.s VT0, VX1, VXC + vfmadd.s VT0, VX3, VXS, VT0 + vfmul.s VT1, VX1, VXS + vfmsub.s VT1, VX3, VXC, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + b .L997 + .align 3 + +.L212: // C!=0 S==0 + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmul.s VT0, VXC, VX0 + vfmul.s VT1, VX2, VXC + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + vfmul.s VT0, VX1, VXC + vfmul.s VT1, VX3, VXS + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L212 + b .L997 + .align 3 + +.L213: // C==0 S!=0 + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmul.s VT0, VXS, VX2 + vfmul.s VT1, VXS, VX0 + vfsub.s VT1, VXZ, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, 
INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + vfmul.s VT0, VX3, VXS + vfmul.s VT1, VX1, VXS + vfsub.s VT1, VXZ, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L213 + b .L997 + .align 3 + +.L214: // C==0 S==0 + vstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + move XX, X + fcmp.ceq.s $fcc0, C, a1 + bcnez $fcc0, .L220 + fcmp.ceq.s $fcc0, S, a1 + bcnez $fcc0, .L222 // C!=0 S==0 + b .L221 // C!=0 S!=0 + .align 3 + +.L220: + fcmp.ceq.s $fcc0, S, a1 + bcnez $fcc0, .L224 // C==0 S==0 + b .L223 // C==0 S!=0 + .align 3 + +.L221: // C!=0 S!=0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VT0, VX0, VXC + vfmadd.s VT0, VX2, VXS, VT0 + vfmul.s VT1, VX0, VXS + vfmsub.s VT1, VX2, VXC, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vfmul.s VT0, VX1, VXC + vfmadd.s VT0, VX3, VXS, VT0 + vfmul.s VT1, VX0, VXS + vfmsub.s VT1, VX3, VXC, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: // C!=0 S==0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, 
X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VT0, VX0, VXC + vfmul.s VT1, VX2, VXC + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vfmul.s VT0, VX1, VXC + vfmul.s VT1, VX3, VXC + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: // C==0 S!=0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VT0, VX2, VXS + vfmul.s VT1, VX0, VXS + vfsub.s VT1, VXZ, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vfmul.s VT0, VX3, VXS + vfmul.s VT1, VX0, VXS + vfsub.s VT1, VXZ, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d 
XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: // C==0 S==0 + vstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + b .L997 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.s $f12, X, 0 * SIZE + fld.s $f13, Y, 0 * SIZE + fmul.s $f10, $f12, C + fmadd.s $f10, $f13, S, $f10 + fst.s $f10, X, 0 * SIZE + addi.d I, I, -1 + fmul.s $f20, $f12, S + fmsub.s $f20, $f13, C, $f20 + fst.s $f20, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE \ No newline at end of file From d32f38fb37c94afae20a662182fe7178c005a673 Mon Sep 17 00:00:00 2001 From: yancheng Date: Thu, 7 Dec 2023 13:15:55 +0800 Subject: [PATCH 15/15] loongarch64: Add optimizations for nrm2. 
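The new nrm2 kernels follow the classic scaled sum-of-squares scheme in the
double-precision case: a first pass finds the element of largest magnitude
(xvfmaxa.d / vfmaxa.d), the data are then multiplied by 1/max so the squares can
neither overflow nor underflow, and the accumulated sum is rescaled by max after
the square root. The single-precision kernels instead widen each float to double
(xvfcvtl.d.s / xvfcvth.d.s) and accumulate directly. For review, a minimal
scalar sketch of the double-precision computation follows; the function name and
prototype are illustrative only, not the OpenBLAS kernel interface, and inc_x > 0
is assumed:

#include <math.h>
#include <stddef.h>

/* Scalar sketch of the scaled sum-of-squares dnrm2 computed by
 * dnrm2_lsx.S / dnrm2_lasx.S (illustrative, assumes inc_x > 0). */
static double dnrm2_sketch(size_t n, const double *x, size_t inc_x)
{
    if (n == 0 || inc_x == 0)
        return 0.0;

    /* pass 1: largest magnitude, as the xvfmaxa.d / vfmaxa.d loops do */
    double amax = 0.0;
    const double *p = x;
    for (size_t i = 0; i < n; i++, p += inc_x) {
        double a = fabs(*p);
        if (a > amax)
            amax = a;
    }
    if (amax == 0.0)
        return 0.0;

    /* pass 2: sum of squares of the scaled values, then undo the scaling */
    double alpha = 1.0 / amax;
    double ssq = 0.0;
    p = x;
    for (size_t i = 0; i < n; i++, p += inc_x) {
        double t = *p * alpha;
        ssq += t * t;
    }
    return amax * sqrt(ssq);
}
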
--- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 3 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 3 + kernel/loongarch64/dnrm2_lasx.S | 233 ++++++++++++++++++++++ kernel/loongarch64/dnrm2_lsx.S | 242 +++++++++++++++++++++++ kernel/loongarch64/snrm2_lasx.S | 143 ++++++++++++++ kernel/loongarch64/snrm2_lsx.S | 156 +++++++++++++++ 6 files changed, 780 insertions(+) create mode 100644 kernel/loongarch64/dnrm2_lasx.S create mode 100644 kernel/loongarch64/dnrm2_lsx.S create mode 100644 kernel/loongarch64/snrm2_lasx.S create mode 100644 kernel/loongarch64/snrm2_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 026ea0d77..1e4fa7a9d 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -52,4 +52,7 @@ DASUMKERNEL = dasum_lsx.S SROTKERNEL = srot_lsx.S DROTKERNEL = drot_lsx.S +SNRM2KERNEL = snrm2_lsx.S +DNRM2KERNEL = dnrm2_lsx.S + endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 4905a50a9..f00abcb32 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -52,6 +52,9 @@ DASUMKERNEL = dasum_lasx.S SROTKERNEL = srot_lasx.S DROTKERNEL = drot_lasx.S +SNRM2KERNEL = snrm2_lasx.S +DNRM2KERNEL = dnrm2_lasx.S + DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S DGEMMITCOPY = dgemm_tcopy_16.S diff --git a/kernel/loongarch64/dnrm2_lasx.S b/kernel/loongarch64/dnrm2_lasx.S new file mode 100644 index 000000000..2a9c3cf7b --- /dev/null +++ b/kernel/loongarch64/dnrm2_lasx.S @@ -0,0 +1,233 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define XX $r19 +#define I $r17 +#define TEMP $r18 +#define t1 $r12 +#define t2 $r13 +#define t3 $r14 +#define t4 $r15 +#define VX0 $xr15 +#define VX1 $xr16 +#define VM0 $xr17 +#define VM1 $xr18 +#define VM2 $xr13 +#define VM3 $xr14 +#define res1 $xr19 +#define res2 $xr20 +#define VALPHA $xr21 +#define INF $f23 +#define a1 $f22 +#define max $f17 +#define ALPHA $f12 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + bge $r0, N, .L999 + beq $r0, INCX, .L999 + move XX, X + // Init INF + addi.d TEMP, $r0, 0x7FF + slli.d TEMP, TEMP, 52 + MTC INF, TEMP + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + xvld VM0, X, 0 + bge $r0, I, .L97 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfmaxa.d VM1, VX1, VX0 + xvfmaxa.d VM0, VM0, VM1 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + b .L96 + .align 3 + +.L20: // INCX!=1 + move TEMP, X // initialize the maxa value + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L97 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t2, 1 + .align 3 + +.L21: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t3, 2 + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t3, 2 + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t4, 3 + xvfmaxa.d VM1, VX0, VX1 + xvfmaxa.d VM0, VM0, VM1 + addi.d I, I, -1 + blt 
$r0, I, .L21 + b .L96 + .align 3 + +.L96: + xvpickve.d VX0, VM0, 1 + xvpickve.d VX1, VM0, 2 + xvpickve.d VM3, VM0, 3 + xvfmaxa.d VM1, VX0, VX1 + xvfmaxa.d VM2, VM3, VM0 + xvfmaxa.d VM0, VM1, VM2 + .align 3 + +.L97: + andi I, N, 7 + bge $r0, I, .L99 + .align 3 + +.L98: + xvld VX1, X, 0 + xvfmaxa.d VM0, VM0, VX1 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L98 + .align 3 + +.L99: + fabs.d max, max + lu12i.w TEMP, 0x3f800 // 1 + movgr2fr.d a1, $r0 + movgr2fr.w ALPHA, TEMP + CMPEQ $fcc0, max, a1 + fcvt.d.s ALPHA, ALPHA + bcnez $fcc0, .L999 + fdiv.d ALPHA, ALPHA, max + CMPEQ $fcc0, INF, ALPHA + bcnez $fcc0, .L999 + movfr2gr.d TEMP, ALPHA + xvreplgr2vr.d VALPHA, TEMP + +.L100: + li.d TEMP, SIZE + bne INCX, TEMP, .L120 + srai.d I, N, 3 + bge $r0, I, .L997 + .align 3 + +.L110: + xvld VX0, XX, 0 * SIZE + xvld VX1, XX, 4 * SIZE + xvfmul.d VM0, VX0, VALPHA + xvfmul.d VM1, VX1, VALPHA + xvfmadd.d res1, VM0, VM0, res1 + xvfmadd.d res2, VM1, VM1, res2 + addi.d XX, XX, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L110 + b .L996 + .align 3 + +.L120: + srai.d I, N, 3 + bge $r0, I, .L997 + +.L121: + ld.d t1, XX, 0 * SIZE + add.d XX, XX, INCX + ld.d t2, XX, 0 * SIZE + add.d XX, XX, INCX + ld.d t3, XX, 0 * SIZE + add.d XX, XX, INCX + ld.d t4, XX, 0 * SIZE + add.d XX, XX, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, XX, 0 * SIZE + add.d XX, XX, INCX + ld.d t2, XX, 0 * SIZE + add.d XX, XX, INCX + ld.d t3, XX, 0 * SIZE + add.d XX, XX, INCX + ld.d t4, XX, 0 * SIZE + add.d XX, XX, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmul.d VM0, VX0, VALPHA + xvfmul.d VM1, VX1, VALPHA + xvfmadd.d res1, VM0, VM0, res1 + xvfmadd.d res2, VM1, VM1, res2 + addi.d I, I, -1 + blt $r0, I, .L121 + b .L996 + .align 3 + +.L996: + xvfadd.d res1, res1, res2 + xvpickve.d VX0, res1, 1 + xvpickve.d VX1, res1, 2 + xvpickve.d VM0, res1, 3 + xvfadd.d res1, VX0, res1 + xvfadd.d VX1, VX1, VM0 + xvfadd.d res1, VX1, res1 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.d $f15, XX, 0 * SIZE + addi.d I, I, -1 + fmul.d $f15, $f15, ALPHA + fmadd.d $f19, $f15, $f15, $f19 + add.d XX, XX , INCX + blt $r0, I, .L998 + fsqrt.d $f19, $f19 + fmul.d $f0, max, $f19 + jirl $r0, $r1, 0x0 + .align 3 + +.L999: + fmov.d $f0, $f19 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/dnrm2_lsx.S b/kernel/loongarch64/dnrm2_lsx.S new file mode 100644 index 000000000..e4615e18d --- /dev/null +++ b/kernel/loongarch64/dnrm2_lsx.S @@ -0,0 +1,242 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define XX $r19 +#define I $r17 +#define TEMP $r18 +#define t1 $r12 +#define t2 $r13 +#define t3 $r14 +#define t4 $r15 +#define VX0 $vr15 +#define VX1 $vr16 +#define VM0 $vr17 +#define VM1 $vr18 +#define VM2 $vr13 +#define VM3 $vr14 +#define res1 $vr19 +#define res2 $vr20 +#define VALPHA $vr21 +#define INF $f23 +#define a1 $f22 +#define max $f17 +#define ALPHA $f12 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + bge $r0, N, .L999 + beq $r0, INCX, .L999 + move XX, X + // Init INF + addi.d TEMP, $r0, 0x7FF + slli.d TEMP, TEMP, 52 + MTC INF, TEMP + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + vld VM0, X, 0 + bge $r0, I, .L97 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfmaxa.d VM1, VX1, VX0 + 
vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vfmaxa.d VM2, VX1, VX0 + vfmaxa.d VM3, VM1, VM2 + vfmaxa.d VM0, VM0, VM3 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + b .L96 + .align 3 + +.L20: // INCX!=1 + move TEMP, X // initialize the maxa value + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L97 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t2, 1 + .align 3 + +.L21: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM1, VX0, VX1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM2, VX0, VX1 + vfmaxa.d VM3, VM1, VM2 + vfmaxa.d VM0, VM0, VM3 + addi.d I, I, -1 + blt $r0, I, .L21 + b .L96 + .align 3 + +.L96: + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + vfmaxa.d VM0, VX0, VX1 + .align 3 + +.L97: + andi I, N, 7 + bge $r0, I, .L99 + .align 3 + +.L98: + vld VX1, X, 0 + vfmaxa.d VM0, VM0, VX1 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L98 + .align 3 + +.L99: + fabs.d max, max + lu12i.w TEMP, 0x3f800 // 1 + movgr2fr.d a1, $r0 + movgr2fr.w ALPHA, TEMP + CMPEQ $fcc0, max, a1 + fcvt.d.s ALPHA, ALPHA + bcnez $fcc0, .L999 + fdiv.d ALPHA, ALPHA, max + CMPEQ $fcc0, INF, ALPHA + bcnez $fcc0, .L999 + movfr2gr.d TEMP, ALPHA + vreplgr2vr.d VALPHA, TEMP + +.L100: + li.d TEMP, SIZE + bne INCX, TEMP, .L120 + srai.d I, N, 3 + bge $r0, I, .L997 + .align 3 + +.L110: + vld VX0, XX, 0 * SIZE + vld VX1, XX, 2 * SIZE + vfmul.d VM0, VX0, VALPHA + vfmul.d VM1, VX1, VALPHA + vfmadd.d res1, VM0, VM0, res1 + vfmadd.d res2, VM1, VM1, res2 + vld VX0, XX, 4 * SIZE + vld VX1, XX, 6 * SIZE + vfmul.d VM0, VX0, VALPHA + vfmul.d VM1, VX1, VALPHA + vfmadd.d res1, VM0, VM0, res1 + vfmadd.d res2, VM1, VM1, res2 + addi.d XX, XX, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L110 + b .L996 + .align 3 + +.L120: + srai.d I, N, 3 + bge $r0, I, .L997 + +.L121: + ld.d t1, XX, 0 * SIZE + add.d XX, XX, INCX + ld.d t2, XX, 0 * SIZE + add.d XX, XX, INCX + ld.d t3, XX, 0 * SIZE + add.d XX, XX, INCX + ld.d t4, XX, 0 * SIZE + add.d XX, XX, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmul.d VM0, VX0, VALPHA + ld.d t1, XX, 0 * SIZE + add.d XX, XX, INCX + vfmul.d VM1, VX1, VALPHA + ld.d t2, XX, 0 * SIZE + add.d XX, XX, INCX + vfmadd.d res1, VM0, VM0, res1 + vfmadd.d res2, VM1, VM1, res2 + ld.d t3, XX, 0 * SIZE + add.d XX, XX, INCX + ld.d t4, XX, 0 * SIZE + add.d XX, XX, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmul.d VM0, VX0, VALPHA + vfmul.d VM1, VX1, VALPHA + vfmadd.d res1, VM0, VM0, res1 + vfmadd.d res2, VM1, VM1, res2 + addi.d I, I, -1 + blt $r0, I, .L121 + b .L996 + .align 3 + +.L996: + vfadd.d res1, res1, res2 + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.d $f15, XX, 0 * SIZE + addi.d I, I, -1 + fmul.d $f15, $f15, ALPHA + fmadd.d $f19, $f15, $f15, $f19 + add.d XX, XX , INCX + blt $r0, I, .L998 + fsqrt.d $f19, $f19 + fmul.d $f0, max, $f19 + jirl $r0, $r1, 0x0 + .align 
3 + +.L999: + fmov.d $f0, $f19 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/snrm2_lasx.S b/kernel/loongarch64/snrm2_lasx.S new file mode 100644 index 000000000..274908c14 --- /dev/null +++ b/kernel/loongarch64/snrm2_lasx.S @@ -0,0 +1,143 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r12 +#define t2 $r13 +#define t3 $r14 +#define t4 $r15 +#define VX0 $xr15 +#define VX1 $xr16 +#define VX2 $xr17 +#define VX3 $xr18 +#define res1 $xr19 +#define res2 $xr20 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + bge $r0, N, .L999 + beq $r0, INCX, .L999 + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L997 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvld VX1, X, 0 * SIZE + xvfcvtl.d.s VX0, VX0 + xvfcvth.d.s VX1, VX1 + xvfmadd.d res1, VX0, VX0, res1 + xvfmadd.d res2, VX1, VX1, res2 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + .align 3 + b .L996 + +.L20: + bge $r0, I, .L997 + .align 3 + +.L21: + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 0 + xvinsgr2vr.w VX1, t2, 1 + xvinsgr2vr.w VX1, t3, 2 + xvinsgr2vr.w VX1, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 4 + xvinsgr2vr.w VX1, t2, 5 + xvinsgr2vr.w VX1, t3, 6 + xvinsgr2vr.w VX1, t4, 7 + xvfcvtl.d.s VX0, VX0 + xvfcvth.d.s VX1, VX1 + xvfmadd.d res1, VX0, VX0, res1 + xvfmadd.d res2, VX1, VX1, res2 + addi.d I, I, -1 + blt $r0, I, .L21 + b .L996 + +.L996: + xvfadd.d res1, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.s $f15, X, 0 * SIZE + addi.d I, I, -1 + fcvt.d.s $f15, $f15 + fmadd.d $f19, $f15, $f15, $f19 + add.d X, X, INCX + blt $r0, I, .L998 + .align 3 + +.L999: + fsqrt.d $f19, $f19 + move $r4, $r17 + fcvt.s.d $f0, $f19 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/snrm2_lsx.S b/kernel/loongarch64/snrm2_lsx.S new file mode 100644 index 000000000..17d017900 --- /dev/null +++ b/kernel/loongarch64/snrm2_lsx.S @@ -0,0 +1,156 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r12 +#define t2 $r13 +#define t3 $r14 +#define t4 $r15 +#define VX0 $vr15 +#define VX1 $vr16 +#define VX2 $vr17 +#define VX3 $vr18 +#define res1 $vr19 +#define res2 $vr20 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + vxor.v res1, 
res1, res1 + vxor.v res2, res2, res2 + bge $r0, N, .L999 + beq $r0, INCX, .L999 + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L997 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vld VX1, X, 0 * SIZE + vfcvtl.d.s VX0, VX0 + vfcvth.d.s VX1, VX1 + vfmadd.d res1, VX0, VX0, res1 + vfmadd.d res2, VX1, VX1, res2 + vld VX2, X, 4 * SIZE + vld VX3, X, 4 * SIZE + vfcvtl.d.s VX2, VX2 + vfcvth.d.s VX3, VX3 + vfmadd.d res1, VX2, VX2, res1 + vfmadd.d res2, VX3, VX3, res2 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + b .L996 + .align 3 + + +.L20: + bge $r0, I, .L997 + .align 3 + +.L21: + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfcvtl.d.s VX0, VX0 + vfcvth.d.s VX1, VX1 + vfmadd.d res1, VX0, VX0, res1 + vfmadd.d res2, VX1, VX1, res2 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vfcvtl.d.s VX2, VX2 + vfcvth.d.s VX3, VX3 + vfmadd.d res1, VX2, VX2, res1 + vfmadd.d res2, VX3, VX3, res2 + addi.d I, I, -1 + blt $r0, I, .L21 + b .L996 + .align 3 + +.L996: + vfadd.d res1, res1, res2 + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.s $f15, X, 0 * SIZE + addi.d I, I, -1 + fcvt.d.s $f15, $f15 + fmadd.d $f19, $f15, $f15, $f19 + add.d X, X, INCX + blt $r0, I, .L998 + .align 3 + +.L999: + fsqrt.d $f19, $f19 + move $r4, $r17 + fcvt.s.d $f0, $f19 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE
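For reference, the single-precision kernels above (snrm2_lsx.S / snrm2_lasx.S)
skip the scaling pass and instead widen each float to double before squaring,
which the following scalar sketch mirrors; the name and prototype are
illustrative only, not the OpenBLAS kernel interface, and inc_x > 0 is assumed:

#include <math.h>
#include <stddef.h>

/* Scalar sketch of snrm2: widen each float to double (the vector code
 * uses vfcvtl.d.s / vfcvth.d.s, or the xv- forms in LASX, on 4- and
 * 8-element blocks), accumulate the squares in double so they cannot
 * overflow, then narrow the root back to float.  Illustrative only,
 * assumes inc_x > 0. */
static float snrm2_sketch(size_t n, const float *x, size_t inc_x)
{
    if (n == 0 || inc_x == 0)
        return 0.0f;

    double ssq = 0.0;
    const float *p = x;
    for (size_t i = 0; i < n; i++, p += inc_x) {
        double t = (double)*p;
        ssq += t * t;
    }
    return (float)sqrt(ssq);
}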