diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index b2a396674..1e4fa7a9d 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -4,4 +4,55 @@ SDOTKERNEL = dot_lsx.S DSDOTKERNEL = dot_lsx.S DDOTKERNEL = dot_lsx.S +SSCALKERNEL = sscal_lsx.S +DSCALKERNEL = dscal_lsx.S + +SAMAXKERNEL = samax_lsx.S +DAMAXKERNEL = damax_lsx.S + +SAMINKERNEL = samin_lsx.S +DAMINKERNEL = damin_lsx.S + +SMAXKERNEL = smax_lsx.S +DMAXKERNEL = dmax_lsx.S + +SMINKERNEL = smin_lsx.S +DMINKERNEL = dmin_lsx.S + +ISMAXKERNEL = ismax_lsx.S +IDMAXKERNEL = idmax_lsx.S + +ISMINKERNEL = ismin_lsx.S +IDMINKERNEL = idmin_lsx.S + +ISAMAXKERNEL = isamax_lsx.S +IDAMAXKERNEL = idamax_lsx.S + +ISAMINKERNEL = isamin_lsx.S +IDAMINKERNEL = idamin_lsx.S + +SCOPYKERNEL = scopy_lsx.S +DCOPYKERNEL = dcopy_lsx.S + +SSWAPKERNEL = sswap_lsx.S +DSWAPKERNEL = dswap_lsx.S + +SAXPYKERNEL = saxpy_lsx.S +DAXPYKERNEL = daxpy_lsx.S + +SAXPBYKERNEL = saxpby_lsx.S +DAXPBYKERNEL = daxpby_lsx.S + +SSUMKERNEL = ssum_lsx.S +DSUMKERNEL = dsum_lsx.S + +SASUMKERNEL = sasum_lsx.S +DASUMKERNEL = dasum_lsx.S + +SROTKERNEL = srot_lsx.S +DROTKERNEL = drot_lsx.S + +SNRM2KERNEL = snrm2_lsx.S +DNRM2KERNEL = dnrm2_lsx.S + endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 020a82303..f00abcb32 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -4,6 +4,57 @@ SDOTKERNEL = dot_lasx.S DSDOTKERNEL = dot_lasx.S DDOTKERNEL = dot_lasx.S +SSCALKERNEL = sscal_lasx.S +DSCALKERNEL = dscal_lasx.S + +SAMAXKERNEL = samax_lasx.S +DAMAXKERNEL = damax_lasx.S + +SAMINKERNEL = samin_lasx.S +DAMINKERNEL = damin_lasx.S + +SMAXKERNEL = smax_lasx.S +DMAXKERNEL = dmax_lasx.S + +SMINKERNEL = smin_lasx.S +DMINKERNEL = dmin_lasx.S + +ISMAXKERNEL = ismax_lasx.S +IDMAXKERNEL = idmax_lasx.S + +ISMINKERNEL = ismin_lasx.S +IDMINKERNEL = idmin_lasx.S + +ISAMAXKERNEL = isamax_lasx.S +IDAMAXKERNEL = idamax_lasx.S + +ISAMINKERNEL = isamin_lasx.S +IDAMINKERNEL = idamin_lasx.S + +SCOPYKERNEL = scopy_lasx.S +DCOPYKERNEL = dcopy_lasx.S + +SSWAPKERNEL = sswap_lasx.S +DSWAPKERNEL = dswap_lasx.S + +SAXPYKERNEL = saxpy_lasx.S +DAXPYKERNEL = daxpy_lasx.S + +SAXPBYKERNEL = saxpby_lasx.S +DAXPBYKERNEL = daxpby_lasx.S + +SSUMKERNEL = ssum_lasx.S +DSUMKERNEL = dsum_lasx.S + +SASUMKERNEL = sasum_lasx.S +DASUMKERNEL = dasum_lasx.S + +SROTKERNEL = srot_lasx.S +DROTKERNEL = drot_lasx.S + +SNRM2KERNEL = snrm2_lasx.S +DNRM2KERNEL = dnrm2_lasx.S + DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S DGEMMITCOPY = dgemm_tcopy_16.S diff --git a/kernel/loongarch64/damax_lasx.S b/kernel/loongarch64/damax_lasx.S new file mode 100644 index 000000000..c44ce4995 --- /dev/null +++ b/kernel/loongarch64/damax_lasx.S @@ -0,0 +1,183 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define J $r13 +#define t1 $r14 +#define t2 $r18 +#define t3 $r15 +#define t4 $r17 +#define TEMP $r16 +#define m0 $xr8 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define x5 $xr13 +#define x6 $xr14 +#define x7 $xr15 +#define x8 $xr16 +#define VX0 $xr20 +#define VX1 $xr21 +#define VM0 $xr22 +#define VM1 $xr23 +#define VM2 $xr18 +#define VM3 $xr19 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + xvld VM0, X, 0 + srai.d I, N, 3 + bge $r0, I, .L12 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + addi.d I, I, -1 + xvfmaxa.d VM1, VX1, VX0 + addi.d X, X, 8 * SIZE + xvfmaxa.d VM0, VM0, VM1 + blt $r0, I, .L10 + .align 3 + +.L11: + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmaxa.d VM1, x1, x2 + xvfmaxa.d VM2, x3, x4 + xvfmaxa.d VM0, VM1, VM2 + .align 3 + +.L12: //INCX==1 and N<8 + andi I, N, 7 + li.d J, 4 + bge J, I, .L13 // 4