diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 879a6f68b..a94303151 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -37,4 +37,10 @@ DCOPYKERNEL = dcopy_lsx.S SSWAPKERNEL = sswap_lsx.S DSWAPKERNEL = dswap_lsx.S +SAXPYKERNEL = saxpy_lsx.S +DAXPYKERNEL = daxpy_lsx.S + +SAXPBYKERNEL = saxpby_lsx.S +DAXPBYKERNEL = daxpby_lsx.S + endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 581cfdbbe..4cfd53058 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -37,6 +37,12 @@ DCOPYKERNEL = dcopy_lasx.S SSWAPKERNEL = sswap_lasx.S DSWAPKERNEL = dswap_lasx.S +SAXPYKERNEL = saxpy_lasx.S +DAXPYKERNEL = daxpy_lasx.S + +SAXPBYKERNEL = saxpby_lasx.S +DAXPBYKERNEL = daxpby_lasx.S + DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S DGEMMITCOPY = dgemm_tcopy_16.S diff --git a/kernel/loongarch64/daxpby_lasx.S b/kernel/loongarch64/daxpby_lasx.S new file mode 100644 index 000000000..4b19703e7 --- /dev/null +++ b/kernel/loongarch64/daxpby_lasx.S @@ -0,0 +1,629 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define ALPHA $f0 +#define X $r5 +#define INCX $r6 +#define BETA $f1 +#define Y $r7 +#define INCY $r8 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define a2 $f13 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VXA $xr23 +#define VXB $xr9 +#define VXZ $xr19 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.d.l a1, a1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + movfr2gr.d t1, ALPHA + xvreplgr2vr.d VXA, t1 + movfr2gr.d t2, BETA + xvreplgr2vr.d VXB, t2 + movfr2gr.d t3, a1 + xvreplgr2vr.d VXZ, t3 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + fcmp.ceq.d $fcc0, ALPHA, a1 + bcnez $fcc0, .L110 + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L112 // ALPHA!=0 BETA==0 + b .L111 // ALPHA!=0 BETA!=0 + .align 3 + +.L110: + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L114 // ALPHA==0 BETA==0 + b .L113 // ALPHA==0 BETA!=0 + .align 3 + +.L111: // ALPHA!=0 BETA!=0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvfmul.d VX0, VX0, VXA + xvfmul.d VX1, VX1, VXA + xvfmadd.d VX2, VX2, VXB, VX0 + xvfmadd.d VX3, VX3, VXB, VX1 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // ALPHA!=0 BETA==0 + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfmul.d VX0, VX0, VXA + xvfmul.d VX1, VX1, VXA + xvst VX0, Y, 0 * SIZE + xvst VX1, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // ALPHA==0 BETA!=0 + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + xvfmul.d VX2, VX2, VXB + xvfmul.d VX3, VX3, VXB + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // ALPHA==0 BETA==0 + xvst VXZ, Y, 0 * SIZE + xvst VXZ, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b 
.L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + fcmp.ceq.d $fcc0, ALPHA, a1 + bcnez $fcc0, .L120 + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L122 // ALPHA!=0 BETA==0 + b .L121 // ALPHA!=0 BETA!=0 + .align 3 + +.L120: + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L124 // ALPHA==0 BETA==0 + b .L123 // ALPHA==0 BETA!=0 + .align 3 + +.L121: // ALPHA!=0 BETA!=0 + xvld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX0, VX0, VXA + xvld VX1, X, 4 * SIZE + xvfmadd.d VX2, VX2, VXB, VX0 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + xvstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 3 + add.d YY, YY, INCY + xvfmul.d VX1, VX1, VXA + xvfmadd.d VX3, VX3, VXB, VX1 + addi.d I, I, -1 + xvstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + b .L997 + .align 3 + +.L122: // ALPHA!=0 BETA==0 + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfmul.d VX0, VX0, VXA + xvfmul.d VX1, VX1, VXA + xvstelm.d VX0, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX0, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX0, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX0, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L122 + b .L997 + .align 3 + +.L123: // ALPHA==0 BETA!=0 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX2, VX2, VXB + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + xvstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 3 + add.d YY, YY, INCY + xvfmul.d VX3, VX3, VXB + addi.d I, I, -1 + xvstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 3 + add.d YY, YY, INCY + blt $r0, I, .L123 + b .L997 + .align 3 + +.L124: // ALPHA==0 BETA==0 + xvstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 2 + add.d YY, 
YY, INCY + xvstelm.d VXZ, YY, 0, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L124 + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + fcmp.ceq.d $fcc0, ALPHA, a1 + bcnez $fcc0, .L210 + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L212 // ALPHA!=0 BETA==0 + b .L211 // ALPHA!=0 BETA!=0 + .align 3 + +.L210: + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L214 // ALPHA==0 BETA==0 + b .L213 // ALPHA==0 BETA!=0 + .align 3 + +.L211: // ALPHA!=0 BETA!=0 + xvld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvfmul.d VX0, VXA, VX0 + xvfmadd.d VX2, VX2, VXB, VX0 + xvld VX3, Y, 4 * SIZE + xvst VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvfmul.d VX1, VX1, VXA + xvfmadd.d VX3, VX3, VXB, VX1 + addi.d I, I, -1 + xvst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + b .L997 + .align 3 + +.L212: // ALPHA!=0 BETA==0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvfmul.d VX0, VXA, VX0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvst VX0, Y, 0 * SIZE + xvfmul.d VX1, VX1, VXA + addi.d I, I, -1 + xvst VX1, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L212 + b .L997 + .align 3 + +.L213: // ALPHA==0 BETA!=0 + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + xvfmul.d VX2, VX2, VXB + xvfmul.d VX3, VX3, VXB + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L213 + b .L997 + .align 3 + +.L214: // ALPHA==0 BETA==0 + xvst VXZ, Y, 0 * SIZE + xvst VXZ, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L214 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + fcmp.ceq.d $fcc0, ALPHA, a1 + bcnez $fcc0, .L220 + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L222 // ALPHA!=0 BETA==0 + b .L221 // ALPHA!=0 BETA!=0 + .align 3 + +.L220: + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L224 // ALPHA==0 BETA==0 + b .L223 // ALPHA==0 BETA!=0 + .align 3 + +.L221: // ALPHA!=0 BETA!=0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX0, VX0, VXA + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + xvfmadd.d VX2, VX2, VXB, VX0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + 
add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 0 + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX1, VX1, VXA + xvfmadd.d VX3, VX3, VXB, VX1 + addi.d I, I, -1 + xvstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 3 + add.d YY, YY, INCY + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: // ALPHA!=0 BETA==0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvfmul.d VX0, VX0, VXA + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvstelm.d VX0, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX0, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX0, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX0, YY, 0, 3 + add.d YY, YY, INCY + xvfmul.d VX1, VX1, VXA + addi.d I, I, -1 + xvstelm.d VX1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 3 + add.d YY, YY, INCY + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: // ALPHA==0 BETA!=0 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX2, VX2, VXB + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + xvstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 3 + add.d YY, YY, INCY + xvfmul.d VX3, VX3, VXB + addi.d I, I, -1 + xvstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 3 + add.d YY, YY, INCY + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: // ALPHA==0 BETA==0 + xvstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + b .L997 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 
+ .align 3 + +.L998: + fld.d $f12, X, 0 * SIZE + fld.d $f13, Y, 0 * SIZE + addi.d I, I, -1 + fmul.d $f12, $f12, ALPHA + fmadd.d $f13, $f13, BETA, $f12 + fst.d $f13, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/daxpby_lsx.S b/kernel/loongarch64/daxpby_lsx.S new file mode 100644 index 000000000..9aafbaf2a --- /dev/null +++ b/kernel/loongarch64/daxpby_lsx.S @@ -0,0 +1,693 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define ALPHA $f0 +#define X $r5 +#define INCX $r6 +#define BETA $f1 +#define Y $r7 +#define INCY $r8 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define a2 $f13 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VXA $vr23 +#define VXB $vr9 +#define VXZ $vr19 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.d.l a1, a1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + movfr2gr.d t1, ALPHA + vreplgr2vr.d VXA, t1 + movfr2gr.d t2, BETA + vreplgr2vr.d VXB, t2 + movfr2gr.d t3, a1 + vreplgr2vr.d VXZ, t3 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + fcmp.ceq.d $fcc0, ALPHA, a1 + bcnez $fcc0, .L110 + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L112 // ALPHA!=0 BETA==0 + b .L111 // ALPHA!=0 BETA!=0 + .align 3 + +.L110: + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L114 // ALPHA==0 BETA==0 + b .L113 // ALPHA==0 BETA!=0 + .align 3 + +.L111: // ALPHA!=0 BETA!=0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vfmul.d VX0, VX0, VXA + vfmul.d VX1, VX1, VXA + vfmadd.d VX2, VX2, VXB, VX0 + vfmadd.d VX3, VX3, VXB, VX1 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + vfmul.d VX0, VX0, VXA + vfmul.d VX1, VX1, VXA + vfmadd.d VX2, VX2, VXB, VX0 + vfmadd.d VX3, VX3, VXB, VX1 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // ALPHA!=0 BETA==0 + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfmul.d VX0, VX0, VXA + vfmul.d VX1, VX1, VXA + vst VX0, Y, 0 * SIZE + vst VX1, Y, 2 * SIZE + vld VX2, X, 4 * SIZE + vld VX3, X, 6 * SIZE + vfmul.d VX2, VX2, VXA + vfmul.d VX3, VX3, VXA + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // ALPHA==0 BETA!=0\ + vld VX0, Y, 0 * SIZE + vld VX1, Y, 2 * SIZE + vfmul.d VX0, VX0, VXB + vfmul.d VX1, VX1, VXB + vst VX0, Y, 0 * SIZE + vst VX1, Y, 2 * SIZE + vld VX2, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + vfmul.d VX2, VX2, VXB + vfmul.d VX3, VX3, VXB + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // ALPHA==0 BETA==0 + vst VXZ, Y, 0 * SIZE + vst VXZ, Y, 2 * SIZE + vst VXZ, Y, 4 * SIZE + vst VXZ, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + fcmp.ceq.d $fcc0, ALPHA, a1 
+ bcnez $fcc0, .L120 + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L122 // ALPHA!=0 BETA==0 + b .L121 // ALPHA!=0 BETA!=0 + .align 3 + +.L120: + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L124 // ALPHA==0 BETA==0 + b .L123 // ALPHA==0 BETA!=0 + .align 3 + +.L121: // ALPHA!=0 BETA!=0 + vld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + vfmul.d VX0, VX0, VXA + vld VX1, X, 2 * SIZE + vfmadd.d VX2, VX2, VXB, VX0 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX1, VX1, VXA + vld VX0, X, 4 * SIZE + vfmadd.d VX3, VX3, VXB, VX1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX0, VX0, VXA + vld VX1, X, 6 * SIZE + vfmadd.d VX2, VX2, VXB, VX0 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX1, VX1, VXA + vfmadd.d VX3, VX3, VXB, VX1 + addi.d I, I, -1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + b .L997 + .align 3 + +.L122: // ALPHA!=0 BETA==0 + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfmul.d VX0, VX0, VXA + vfmul.d VX1, VX1, VXA + vstelm.d VX0, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX0, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + add.d YY, YY, INCY + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vfmul.d VX0, VX0, VXA + vfmul.d VX1, VX1, VXA + vstelm.d VX0, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX0, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L122 + b .L997 + .align 3 + +.L123: // ALPHA==0 BETA!=0 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmul.d VX2, VX2, VXB + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX3, VX3, VXB + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX2, VX2, VXB + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX3, VX3, VXB + addi.d I, I, -1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + blt $r0, I, .L123 + b .L997 + .align 3 + +.L124: // ALPHA==0 BETA==0 + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + 
vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L124 + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + fcmp.ceq.d $fcc0, ALPHA, a1 + bcnez $fcc0, .L210 + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L212 // ALPHA!=0 BETA==0 + b .L211 // ALPHA!=0 BETA!=0 + .align 3 + +.L210: + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L214 // ALPHA==0 BETA==0 + b .L213 // ALPHA==0 BETA!=0 + .align 3 + +.L211: // ALPHA!=0 BETA!=0 + vld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vfmul.d VX0, VXA, VX0 + vld VX3, Y, 2 * SIZE + vfmadd.d VX2, VX2, VXB, VX0 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX2, Y, 0 * SIZE + vfmul.d VX1, VXA, VX1 + vld VX2, Y, 4 * SIZE + vfmadd.d VX3, VX3, VXB, VX1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vst VX3, Y, 2 * SIZE + vfmul.d VX0, VX0, VXA + vld VX3, Y, 6 * SIZE + vfmadd.d VX2, VX2, VXB, VX0 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX2, Y, 4 * SIZE + vfmul.d VX1, VX1, VXA + vfmadd.d VX3, VX3, VXB, VX1 + addi.d I, I, -1 + vst VX3, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + b .L997 + .align 3 + +.L212: // ALPHA!=0 BETA==0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vfmul.d VX0, VXA, VX0 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 0 * SIZE + vfmul.d VX1, VXA, VX1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vst VX1, Y, 2 * SIZE + vfmul.d VX0, VX0, VXA + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 4 * SIZE + vfmul.d VX1, VX1, VXA + addi.d I, I, -1 + vst VX1, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L212 + b .L997 + .align 3 + +.L213: // ALPHA==0 BETA!=0 + vld VX2, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + vfmul.d VX2, VX2, VXB + vfmul.d VX3, VX3, VXB + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + vld VX2, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + vfmul.d VX2, VX2, VXB + vfmul.d VX3, VX3, VXB + addi.d I, I, -1 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L213 + b .L997 + .align 3 + +.L214: // ALPHA==0 BETA==0 + vst VXZ, Y, 0 * SIZE + vst VXZ, Y, 2 * SIZE + vst VXZ, Y, 4 * SIZE + vst VXZ, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L214 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + fcmp.ceq.d $fcc0, ALPHA, a1 + bcnez $fcc0, .L220 + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L222 // ALPHA!=0 BETA==0 + b .L221 // ALPHA!=0 BETA!=0 + .align 3 + +.L220: + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L224 // ALPHA==0 BETA==0 + b .L223 // ALPHA==0 BETA!=0 + .align 3 + +.L221: // ALPHA!=0 BETA!=0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d 
t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, VX0, VXA + vfmadd.d VX2, VX2, VXB, VX0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vfmul.d VX1, VX1, VXA + vfmadd.d VX3, VX3, VXB, VX1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, VX0, VXA + vfmadd.d VX2, VX2, VXB, VX0 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + vfmul.d VX1, VX1, VXA + vfmadd.d VX3, VX3, VXB, VX1 + addi.d I, I, -1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: // ALPHA!=0 BETA==0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vfmul.d VX0, VX0, VXA + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VX0, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX0, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX1, VX1, VXA + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vstelm.d VX1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX0, VX0, VXA + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VX0, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX0, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX1, VX1, VXA + addi.d I, I, -1 + vstelm.d VX1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + add.d YY, YY, INCY + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: // ALPHA==0 BETA!=0 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmul.d VX2, VX2, VXB + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX3, VX3, VXB + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX2, VX2, VXB + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 
1 + add.d YY, YY, INCY + vfmul.d VX3, VX3, VXB + addi.d I, I, -1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: // ALPHA==0 BETA==0 + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + b .L997 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.d $f12, X, 0 * SIZE + fld.d $f13, Y, 0 * SIZE + addi.d I, I, -1 + fmul.d $f12, $f12, ALPHA + fmadd.d $f13, $f13, BETA, $f12 + fst.d $f13, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/daxpy_lasx.S b/kernel/loongarch64/daxpy_lasx.S new file mode 100644 index 000000000..bafd871ab --- /dev/null +++ b/kernel/loongarch64/daxpy_lasx.S @@ -0,0 +1,338 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define XX $r5 +#define YY $r6 +#define ALPHA $f0 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VXA $xr23 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.d.l a1, a1 + movgr2fr.d a2, TEMP + ffint.d.l a2, a2 + fcmp.ceq.d $fcc0, ALPHA, a1 + bcnez $fcc0, .L999 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + movfr2gr.d t1, ALPHA + xvreplgr2vr.d VXA, t1 + + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L113 + fcmp.ceq.d $fcc0, ALPHA, a2 + bceqz $fcc0, .L112 + .align 3 + +.L111: + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvfadd.d VX2, VX0, VX2 + xvfadd.d VX3, VX1, VX3 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + b .L113 + .align 3 + +.L112: + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvfmadd.d VX2, VX0, VXA, VX2 + xvfmadd.d VX3, VX1, VXA, VX3 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L112 + .align 3 + +.L113: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L114: + fld.d $f12, X, 0 * SIZE + fld.d $f14, Y, 0 * SIZE + addi.d I, I, -1 + fmadd.d $f14, $f12, $f0, $f14 + fst.d $f14, Y, 0 * SIZE + addi.d X, X, SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L114 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + move YY, Y + .align 3 + +.L121: + xvld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 
1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmadd.d VX2, VX0, VXA, VX2 + xvld VX1, X, 4 * SIZE + xvstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvfmadd.d VX3, VX1, VXA, VX3 + addi.d I, I, -1 + xvstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + fld.d $f12, X, 0 * SIZE + fld.d $f14, Y, 0 * SIZE + addi.d I, I, -1 + fmadd.d $f14, $f12, $f0, $f14 + fst.d $f14, Y, 0 * SIZE + addi.d X, X, SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L212 + .align 3 + +.L211: + xvld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvfmadd.d VX2, VX0, VXA, VX2 + xvld VX3, Y, 4 * SIZE + xvst VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvfmadd.d VX3, VX1, VXA, VX3 + addi.d I, I, -1 + xvst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + fld.d $f12, X, 0 * SIZE + fld.d $f14, Y, 0 * SIZE + addi.d I, I, -1 + fmadd.d $f14, $f12, $f0, $f14 + fst.d $f14, Y, 0 * SIZE + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + move YY, Y + .align 3 + +.L222: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmadd.d VX2, VX0, VXA, VX2 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 0 + 
xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvfmadd.d VX3, VX1, VXA, VX3 + addi.d I, I, -1 + xvstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 3 + add.d YY, YY, INCY + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + fld.d $f12, X, 0 * SIZE + fld.d $f14, Y, 0 * SIZE + addi.d I, I, -1 + fmadd.d $f14, $f12, $f0, $f14 + fst.d $f14, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + b .L999 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/daxpy_lsx.S b/kernel/loongarch64/daxpy_lsx.S new file mode 100644 index 000000000..fc88f0bb9 --- /dev/null +++ b/kernel/loongarch64/daxpy_lsx.S @@ -0,0 +1,365 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define XX $r5 +#define YY $r6 +#define ALPHA $f0 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VXA $vr23 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.d.l a1, a1 + movgr2fr.d a2, TEMP + ffint.d.l a2, a2 + fcmp.ceq.d $fcc0, ALPHA, a1 + bcnez $fcc0, .L999 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + movfr2gr.d t1, ALPHA + vreplgr2vr.d VXA, t1 + + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L113 + fcmp.ceq.d $fcc0, ALPHA, a2 + bceqz $fcc0, .L112 + .align 3 + +.L111: + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vfadd.d VX2, VX0, VX2 + vfadd.d VX3, VX1, VX3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + vfadd.d VX2, VX0, VX2 + vfadd.d VX3, VX1, VX3 + addi.d I, I, -1 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + b .L113 + .align 3 + +.L112: + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vfmadd.d VX2, VX0, VXA, VX2 + vfmadd.d VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + addi.d X, X, 8 * SIZE + vfmadd.d VX2, VX0, VXA, VX2 + vfmadd.d VX3, VX1, VXA, VX3 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L112 + .align 3 + +.L113: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L114: + fld.d $f12, X, 0 * SIZE + fld.d $f14, Y, 0 * SIZE + addi.d I, I, -1 + fmadd.d $f14, $f12, $f0, $f14 + fst.d $f14, Y, 0 * SIZE + addi.d X, X, SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L114 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + move YY, Y + .align 3 + +.L121: + vld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmadd.d 
VX2, VX0, VXA, VX2 + vld VX1, X, 2 * SIZE + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vfmadd.d VX3, VX1, VXA, VX3 + vld VX0, X, 4 * SIZE + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmadd.d VX2, VX0, VXA, VX2 + vld VX1, X, 6 * SIZE + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vfmadd.d VX3, VX1, VXA, VX3 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + fld.d $f12, X, 0 * SIZE + fld.d $f14, Y, 0 * SIZE + addi.d I, I, -1 + fmadd.d $f14, $f12, $f0, $f14 + fst.d $f14, Y, 0 * SIZE + addi.d X, X, SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L212 + .align 3 + +.L211: + vld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vfmadd.d VX2, VX0, VXA, VX2 + vld VX3, Y, 2 * SIZE + vst VX2, Y, 0 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmadd.d VX3, VX1, VXA, VX3 + vld VX2, Y, 4 * SIZE + vst VX3, Y, 2 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vfmadd.d VX2, VX0, VXA, VX2 + vld VX3, Y, 6 * SIZE + vst VX2, Y, 4 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmadd.d VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vst VX3, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + fld.d $f12, X, 0 * SIZE + fld.d $f14, Y, 0 * SIZE + addi.d I, I, -1 + fmadd.d $f14, $f12, $f0, $f14 + fst.d $f14, Y, 0 * SIZE + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + move YY, Y + .align 3 + +.L222: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmadd.d VX2, VX0, VXA, VX2 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vfmadd.d VX3, VX1, VXA, VX3 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE 
+ add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmadd.d VX2, VX0, VXA, VX2 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + vfmadd.d VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + fld.d $f12, X, 0 * SIZE + fld.d $f14, Y, 0 * SIZE + addi.d I, I, -1 + fmadd.d $f14, $f12, $f0, $f14 + fst.d $f14, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/saxpby_lasx.S b/kernel/loongarch64/saxpby_lasx.S new file mode 100644 index 000000000..c5d1ff402 --- /dev/null +++ b/kernel/loongarch64/saxpby_lasx.S @@ -0,0 +1,597 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define ALPHA $f0 +#define X $r5 +#define INCX $r6 +#define BETA $f1 +#define Y $r7 +#define INCY $r8 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define a2 $f13 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VXA $xr23 +#define VXB $xr9 +#define VXZ $xr19 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.s.l a1, a1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + movfr2gr.s t1, ALPHA + xvreplgr2vr.w VXA, t1 + movfr2gr.s t2, BETA + xvreplgr2vr.w VXB, t2 + movfr2gr.s t3, a1 + xvreplgr2vr.w VXZ, t3 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + fcmp.ceq.s $fcc0, ALPHA, a1 + bcnez $fcc0, .L110 + fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L112 // ALPHA!=0 BETA==0 + b .L111 // ALPHA!=0 BETA!=0 + .align 3 + +.L110: + fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L114 // ALPHA==0 BETA==0 + b .L113 // ALPHA==0 BETA!=0 + .align 3 + +.L111: // ALPHA!=0 BETA!=0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvfmul.s VX0, VX0, VXA + addi.d I, I, -1 + xvfmadd.s VX2, VX2, VXB, VX0 + xvst VX2, Y, 0 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // ALPHA!=0 BETA==0 + xvld VX0, X, 0 * SIZE + xvfmul.s VX0, VX0, VXA + addi.d I, I, -1 + xvst VX0, Y, 0 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // ALPHA==0 BETA!=0 + xvld VX2, Y, 0 * SIZE + xvfmul.s VX2, VX2, VXB + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // ALPHA==0 BETA==0 + xvst VXZ, Y, 0 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + fcmp.ceq.s $fcc0, ALPHA, a1 + bcnez $fcc0, .L120 + fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L122 // ALPHA!=0 BETA==0 + b .L121 // ALPHA!=0 BETA!=0 + .align 3 + +.L120: + 
fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L124 // ALPHA==0 BETA==0 + b .L123 // ALPHA==0 BETA!=0 + .align 3 + +.L121: // ALPHA!=0 BETA!=0 + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX0, VX0, VXA + xvfmadd.s VX2, VX2, VXB, VX0 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + b .L997 + .align 3 + +.L122: // ALPHA!=0 BETA==0 + xvld VX0, X, 0 * SIZE + xvfmul.s VX0, VX0, VXA + addi.d I, I, -1 + xvstelm.w VX0, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 7 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L122 + b .L997 + .align 3 + +.L123: // ALPHA==0 BETA!=0 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX2, VX2, VXB + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L123 + b .L997 + .align 3 + +.L124: // ALPHA==0 BETA==0 + xvstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 7 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L124 + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + fcmp.ceq.s $fcc0, ALPHA, a1 + bcnez $fcc0, .L210 + fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L212 // ALPHA!=0 BETA==0 + b .L211 // ALPHA!=0 BETA!=0 + .align 3 + +.L210: + fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L214 // ALPHA==0 BETA==0 + 
b .L213 // ALPHA==0 BETA!=0 + .align 3 + +.L211: // ALPHA!=0 BETA!=0 + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VX0, VXA, VX0 + xvfmadd.s VX2, VX2, VXB, VX0 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + b .L997 + .align 3 + +.L212: // ALPHA!=0 BETA==0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VX0, VXA, VX0 + addi.d I, I, -1 + xvst VX0, Y, 0 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L212 + b .L997 + .align 3 + +.L213: // ALPHA==0 BETA!=0 + xvld VX2, Y, 0 * SIZE + xvfmul.s VX2, VX2, VXB + xvst VX2, Y, 0 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L213 + b .L997 + .align 3 + +.L214: // ALPHA==0 BETA==0 + xvst VXZ, Y, 0 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L214 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + fcmp.ceq.s $fcc0, ALPHA, a1 + bcnez $fcc0, .L220 + fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L222 // ALPHA!=0 BETA==0 + b .L221 // ALPHA!=0 BETA!=0 + .align 3 + +.L220: + fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L224 // ALPHA==0 BETA==0 + b .L223 // ALPHA==0 BETA!=0 + .align 3 + +.L221: // ALPHA!=0 BETA!=0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX0, VX0, VXA + xvfmadd.s VX2, VX2, VXB, VX0 + addi.d I, I, -1 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 
0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 + add.d YY, YY, INCY + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: // ALPHA!=0 BETA==0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VX0, VX0, VXA + addi.d I, I, -1 + xvstelm.w VX0, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 7 + add.d YY, YY, INCY + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: // ALPHA==0 BETA!=0 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX2, VX2, VXB + addi.d I, I, -1 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 + add.d YY, YY, INCY + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: // ALPHA==0 BETA==0 + xvstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 7 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + b .L997 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.s $f12, X, 0 * SIZE + fld.s $f13, Y, 0 * SIZE + addi.d I, I, -1 + fmul.s $f12, $f12, ALPHA + fmadd.s $f13, $f13, BETA, $f12 + fst.s $f13, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/saxpby_lsx.S b/kernel/loongarch64/saxpby_lsx.S new file mode 100644 index 000000000..7f8cea2dd --- /dev/null +++ b/kernel/loongarch64/saxpby_lsx.S @@ -0,0 +1,629 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define ALPHA $f0 +#define X $r5 +#define INCX $r6 +#define BETA $f1 +#define Y $r7 +#define INCY $r8 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 
+#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define a2 $f13 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VXA $vr23 +#define VXB $vr9 +#define VXZ $vr19 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.s.l a1, a1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + movfr2gr.s t1, ALPHA + vreplgr2vr.w VXA, t1 + movfr2gr.s t2, BETA + vreplgr2vr.w VXB, t2 + movfr2gr.s t3, a1 + vreplgr2vr.w VXZ, t3 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + fcmp.ceq.s $fcc0, ALPHA, a1 + bcnez $fcc0, .L110 + fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L112 // ALPHA!=0 BETA==0 + b .L111 // ALPHA!=0 BETA!=0 + .align 3 + +.L110: + fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L114 // ALPHA==0 BETA==0 + b .L113 // ALPHA==0 BETA!=0 + .align 3 + +.L111: // ALPHA!=0 BETA!=0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vfmul.s VX0, VX0, VXA + vfmul.s VX1, VX1, VXA + vfmadd.s VX2, VX2, VXB, VX0 + vfmadd.s VX3, VX3, VXB, VX1 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // ALPHA!=0 BETA==0 + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vfmul.s VX0, VX0, VXA + vfmul.s VX1, VX1, VXA + vst VX0, Y, 0 * SIZE + vst VX1, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // ALPHA==0 BETA!=0 + vld VX2, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + vfmul.s VX2, VX2, VXB + vfmul.s VX3, VX3, VXB + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // ALPHA==0 BETA==0 + vst VXZ, Y, 0 * SIZE + vst VXZ, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + fcmp.ceq.s $fcc0, ALPHA, a1 + bcnez $fcc0, .L120 + fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L122 // ALPHA!=0 BETA==0 + b .L121 // ALPHA!=0 BETA!=0 + .align 3 + +.L120: + fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L124 // ALPHA==0 BETA==0 + b .L123 // ALPHA==0 BETA!=0 + .align 3 + +.L121: // ALPHA!=0 BETA!=0 + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VX0, VX0, VXA + vld VX1, X, 4 * SIZE + vfmadd.s VX2, VX2, VXB, VX0 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + vfmul.s VX1, VX1, VXA + vfmadd.s VX3, VX3, VXB, VX1 + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 
3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + b .L997 + .align 3 + +.L122: // ALPHA!=0 BETA==0 + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vfmul.s VX0, VX0, VXA + vfmul.s VX1, VX1, VXA + vstelm.w VX0, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L122 + b .L997 + .align 3 + +.L123: // ALPHA==0 BETA!=0 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VX2, VX2, VXB + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + vfmul.s VX3, VX3, VXB + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 + add.d YY, YY, INCY + blt $r0, I, .L123 + b .L997 + .align 3 + +.L124: // ALPHA==0 BETA==0 + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L124 + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + fcmp.ceq.s $fcc0, ALPHA, a1 + bcnez $fcc0, .L210 + fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L212 // ALPHA!=0 BETA==0 + b .L211 // ALPHA!=0 BETA!=0 + .align 3 + +.L210: + fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L214 // ALPHA==0 BETA==0 + b .L213 // ALPHA==0 BETA!=0 + .align 3 + +.L211: // ALPHA!=0 BETA!=0 + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmul.s VX0, VXA, VX0 + vld VX3, Y, 4 * SIZE + vfmadd.s VX2, VX2, VXB, VX0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vst VX2, Y, 0 * SIZE + vfmul.s VX1, VX1, VXA + vfmadd.s VX3, VX3, VXB, VX1 + addi.d I, I, -1 + vst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + b .L997 + .align 3 + +.L212: // ALPHA!=0 BETA==0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + 
vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmul.s VX0, VXA, VX0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vst VX0, Y, 0 * SIZE + vfmul.s VX1, VX1, VXA + addi.d I, I, -1 + vst VX1, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L212 + b .L997 + .align 3 + +.L213: // ALPHA==0 BETA!=0 + vld VX2, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + vfmul.s VX2, VX2, VXB + vfmul.s VX3, VX3, VXB + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L213 + b .L997 + .align 3 + +.L214: // ALPHA==0 BETA==0 + vst VXZ, Y, 0 * SIZE + vst VXZ, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L214 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + fcmp.ceq.s $fcc0, ALPHA, a1 + bcnez $fcc0, .L220 + fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L222 // ALPHA!=0 BETA==0 + b .L221 // ALPHA!=0 BETA!=0 + .align 3 + +.L220: + fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L224 // ALPHA==0 BETA==0 + b .L223 // ALPHA==0 BETA!=0 + .align 3 + +.L221: // ALPHA!=0 BETA!=0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VX0, VX0, VXA + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vfmadd.s VX2, VX2, VXB, VX0 + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vfmul.s VX1, VX1, VXA + addi.d I, I, -1 + vfmadd.s VX3, VX3, VXB, VX1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 + add.d YY, YY, INCY + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: // ALPHA!=0 BETA==0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmul.s VX0, VX0, VXA + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vstelm.w VX0, YY, 0, 0 + add.d YY, YY, INCY + 
vstelm.w VX0, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 3 + add.d YY, YY, INCY + vfmul.s VX1, VX1, VXA + addi.d I, I, -1 + vstelm.w VX1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 3 + add.d YY, YY, INCY + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: // ALPHA==0 BETA!=0 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VX2, VX2, VXB + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + vfmul.s VX3, VX3, VXB + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 + add.d YY, YY, INCY + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: // ALPHA==0 BETA==0 + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + b .L997 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.s $f12, X, 0 * SIZE + fld.s $f13, Y, 0 * SIZE + addi.d I, I, -1 + fmul.s $f12, $f12, ALPHA + fmadd.s $f13, $f13, BETA, $f12 + fst.s $f13, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/saxpy_lasx.S b/kernel/loongarch64/saxpy_lasx.S new file mode 100644 index 000000000..609e26328 --- /dev/null +++ b/kernel/loongarch64/saxpy_lasx.S @@ -0,0 +1,323 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define XX $r5 +#define YY $r6 +#define ALPHA $f0 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VXA $xr23 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.s.l a1, a1 + movgr2fr.d a2, TEMP + ffint.s.l a2, a2 + fcmp.ceq.s $fcc0, ALPHA, a1 + bcnez $fcc0, .L999 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + movfr2gr.s t1, ALPHA + xvreplgr2vr.w VXA, t1 + + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L113 + fcmp.ceq.s $fcc0, 
ALPHA, a2 + bceqz $fcc0, .L112 + .align 3 + +.L111: + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + addi.d I, I, -1 + xvfadd.s VX2, VX0, VX2 + xvst VX2, Y, 0 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + b .L113 + .align 3 + +.L112: + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + addi.d I, I, -1 + xvfmadd.s VX2, VX0, VXA, VX2 + xvst VX2, Y, 0 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L112 + .align 3 + +.L113: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L114: + fld.s $f12, X, 0 * SIZE + fld.s $f14, Y, 0 * SIZE + addi.d I, I, -1 + fmadd.s $f14, $f12, $f0, $f14 + fst.s $f14, Y, 0 * SIZE + addi.d X, X, SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L114 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + move YY, Y + .align 3 + +.L121: + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmadd.s VX2, VX0, VXA, VX2 + addi.d I, I, -1 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + fld.s $f12, X, 0 * SIZE + fld.s $f14, Y, 0 * SIZE + addi.d I, I, -1 + fmadd.s $f14, $f12, $f0, $f14 + fst.s $f14, Y, 0 * SIZE + addi.d X, X, SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L212 + .align 3 + +.L211: + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvfmadd.s VX2, VX0, VXA, VX2 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + fld.s $f12, X, 0 * SIZE + fld.s $f14, Y, 0 * SIZE + addi.d I, I, -1 + fmadd.s $f14, $f12, $f0, $f14 + fst.s $f14, Y, 0 * SIZE + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + move YY, Y + .align 3 + +.L222: + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, Y, 0 * SIZE + 
add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmadd.s VX2, VX0, VXA, VX2 + addi.d I, I, -1 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 + add.d YY, YY, INCY + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + fld.s $f12, X, 0 * SIZE + fld.s $f14, Y, 0 * SIZE + addi.d I, I, -1 + fmadd.s $f14, $f12, $f0, $f14 + fst.s $f14, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/saxpy_lsx.S b/kernel/loongarch64/saxpy_lsx.S new file mode 100644 index 000000000..f47415ed6 --- /dev/null +++ b/kernel/loongarch64/saxpy_lsx.S @@ -0,0 +1,338 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define XX $r5 +#define YY $r6 +#define ALPHA $f0 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VXA $vr23 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.s.l a1, a1 + movgr2fr.d a2, TEMP + ffint.s.l a2, a2 + fcmp.ceq.s $fcc0, ALPHA, a1 + bcnez $fcc0, .L999 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + movfr2gr.s t1, ALPHA + vreplgr2vr.w VXA, t1 + + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L113 + fcmp.ceq.s $fcc0, ALPHA, a2 + bceqz $fcc0, .L112 + .align 3 + +.L111: + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vfadd.s VX2, VX0, VX2 + vfadd.s VX3, VX1, VX3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L113 + .align 3 + +.L112: + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vfmadd.s VX2, VX0, VXA, VX2 + vfmadd.s VX3, VX1, VXA, VX3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L113 + .align 3 + 
+.L113: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L114: + fld.s $f12, X, 0 * SIZE + fld.s $f14, Y, 0 * SIZE + addi.d I, I, -1 + fmadd.s $f14, $f12, $f0, $f14 + fst.s $f14, Y, 0 * SIZE + addi.d X, X, SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L114 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + move YY, Y + .align 3 + +.L121: + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmadd.s VX2, VX0, VXA, VX2 + vld VX1, X, 4 * SIZE + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vfmadd.s VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + fld.s $f12, X, 0 * SIZE + fld.s $f14, Y, 0 * SIZE + addi.d I, I, -1 + fmadd.s $f14, $f12, $f0, $f14 + fst.s $f14, Y, 0 * SIZE + addi.d X, X, SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L212 + .align 3 + +.L211: + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmadd.s VX2, VX0, VXA, VX2 + vld VX3, Y, 4 * SIZE + vst VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + vfmadd.s VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + fld.s $f12, X, 0 * SIZE + fld.s $f14, Y, 0 * SIZE + addi.d I, I, -1 + fmadd.s $f14, $f12, $f0, $f14 + fst.s $f14, Y, 0 * SIZE + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + move YY, Y + .align 3 + +.L222: + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmadd.s VX2, VX0, VXA, VX2 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * 
SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vfmadd.s VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 + add.d YY, YY, INCY + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + fld.s $f12, X, 0 * SIZE + fld.s $f14, Y, 0 * SIZE + addi.d I, I, -1 + fmadd.s $f14, $f12, $f0, $f14 + fst.s $f14, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE
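
Reviewer note (not part of the patch): below is a minimal scalar sketch, in C, of the semantics the new saxpby/daxpby and saxpy/daxpy kernels implement, including the alpha/beta special cases the assembly branches on. The kernels hoist those tests out of the loop (the .L11x/.L12x/.L21x/.L22x branches above), process 8 elements per vector iteration (srai.d I, N, 3), and finish the remaining N & 7 elements with a scalar tail. The function names and parameter lists here are illustrative only, not the exact OpenBLAS kernel prototypes.

/* Hedged scalar reference for checking the vector kernels' results.
 * Assumptions: positive strides and float (single precision); the double
 * kernels are identical except for the element type. */
#include <stddef.h>

/* AXPBY: y := alpha*x + beta*y.  The vector code selects one of four loops
 * up front so that a zero alpha (or beta) input is never read, which also
 * avoids propagating NaN/Inf from the unused operand. */
static void saxpby_ref(size_t n, float alpha, const float *x, ptrdiff_t incx,
                       float beta, float *y, ptrdiff_t incy)
{
    for (size_t i = 0; i < n; i++, x += incx, y += incy) {
        if (alpha == 0.0f && beta == 0.0f)
            *y = 0.0f;                   /* store zeros; read neither x nor y */
        else if (beta == 0.0f)
            *y = alpha * *x;             /* overwrite y without reading it */
        else if (alpha == 0.0f)
            *y = beta * *y;              /* scale y without reading x */
        else
            *y = alpha * *x + beta * *y; /* general fused path */
    }
}

/* AXPY: y := alpha*x + y.  The kernels return immediately when alpha == 0
 * and use a plain add (vfadd/xvfadd) instead of an fmadd when alpha == 1. */
static void saxpy_ref(size_t n, float alpha, const float *x, ptrdiff_t incx,
                      float *y, ptrdiff_t incy)
{
    if (alpha == 0.0f)
        return;                          /* y is left untouched */
    for (size_t i = 0; i < n; i++, x += incx, y += incy)
        *y += alpha * *x;
}

A reference like this is only meant as a checking aid when reviewing the unit-stride versus strided (vinsgr2vr/vstelm gather-scatter) paths; it does not model the 8-element blocking or the separate scalar tail loop of the assembly.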