diff --git a/common_loongarch64.h b/common_loongarch64.h index 4963b2f07..b1426da79 100644 --- a/common_loongarch64.h +++ b/common_loongarch64.h @@ -119,19 +119,47 @@ static inline int WhereAmI(void){ #define MOV fmov.d #define CMOVT fsel #define MTC movgr2fr.d +#define MTG movfr2gr.d #define FABS fabs.d +#define FMIN fmin.d +#define FMINA fmina.d +#define FMAX fmax.d +#define FMAXA fmaxa.d #define CMPEQ fcmp.ceq.d #define CMPLE fcmp.cle.d #define CMPLT fcmp.clt.d #define NEG fneg.d +#define FFINT ffint.d.l #define XVFSUB xvfsub.d #define XVFADD xvfadd.d +#define XVFMUL xvfmul.d #define XVFMADD xvfmadd.d +#define XVFMIN xvfmin.d +#define XVFMINA xvfmina.d +#define XVFMAX xvfmax.d +#define XVFMAXA xvfmaxa.d +#define XVCMPEQ xvfcmp.ceq.d +#define XVCMPLE xvfcmp.cle.d +#define XVCMPLT xvfcmp.clt.d +#define XVMUL xvfmul.d +#define XVMSUB xvfmsub.d +#define XVNMSUB xvfnmsub.d #define VFSUB vfsub.d #define VFADD vfadd.d +#define VFMUL vfmul.d #define VFMADD vfmadd.d +#define VFMIN vfmin.d +#define VFMINA vfmina.d +#define VFMAX vfmax.d +#define VFMAXA vfmaxa.d +#define VCMPEQ vfcmp.ceq.d +#define VCMPLE vfcmp.cle.d +#define VCMPLT vfcmp.clt.d +#define VMUL vfmul.d +#define VMSUB vfmsub.d +#define VNMSUB vfnmsub.d #else @@ -147,19 +175,47 @@ static inline int WhereAmI(void){ #define MOV fmov.s #define CMOVT fsel #define MTC movgr2fr.w +#define MTG movfr2gr.s #define FABS fabs.s +#define FMIN fmin.s +#define FMINA fmina.s +#define FMAX fmax.s +#define FMAXA fmaxa.s #define CMPEQ fcmp.ceq.s #define CMPLE fcmp.cle.s #define CMPLT fcmp.clt.s #define NEG fneg.s +#define FFINT ffint.s.l #define XVFSUB xvfsub.s #define XVFADD xvfadd.s +#define XVFMUL xvfmul.s #define XVFMADD xvfmadd.s +#define XVFMIN xvfmin.s +#define XVFMINA xvfmina.s +#define XVFMAX xvfmax.s +#define XVFMAXA xvfmaxa.s +#define XVCMPEQ xvfcmp.ceq.s +#define XVCMPLE xvfcmp.cle.s +#define XVCMPLT xvfcmp.clt.s +#define XVMUL xvfmul.s +#define XVMSUB xvfmsub.s +#define XVNMSUB xvfnmsub.s #define VFSUB vfsub.s #define VFADD vfadd.s +#define VFMUL vfmul.s #define VFMADD vfmadd.s +#define VFMIN vfmin.s +#define VFMINA vfmina.s +#define VFMAX vfmax.s +#define VFMAXA vfmaxa.s +#define VCMPEQ vfcmp.ceq.s +#define VCMPLE vfcmp.cle.s +#define VCMPLT vfcmp.clt.s +#define VMUL vfmul.s +#define VMSUB vfmsub.s +#define VNMSUB vfnmsub.s #endif /* defined(DOUBLE) */ diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 1e4fa7a9d..c365e9a75 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -3,56 +3,108 @@ ifndef NO_LSX SDOTKERNEL = dot_lsx.S DSDOTKERNEL = dot_lsx.S DDOTKERNEL = dot_lsx.S +CDOTKERNEL = cdot_lsx.S +ZDOTKERNEL = cdot_lsx.S -SSCALKERNEL = sscal_lsx.S -DSCALKERNEL = dscal_lsx.S +SSCALKERNEL = scal_lsx.S +DSCALKERNEL = scal_lsx.S +CSCALKERNEL = cscal_lsx.S +ZSCALKERNEL = cscal_lsx.S -SAMAXKERNEL = samax_lsx.S -DAMAXKERNEL = damax_lsx.S +SAMAXKERNEL = amax_lsx.S +DAMAXKERNEL = amax_lsx.S +CAMAXKERNEL = camax_lsx.S -SAMINKERNEL = samin_lsx.S -DAMINKERNEL = damin_lsx.S +SAMINKERNEL = amin_lsx.S +DAMINKERNEL = amin_lsx.S +CAMINKERNEL = camin_lsx.S -SMAXKERNEL = smax_lsx.S -DMAXKERNEL = dmax_lsx.S +SMAXKERNEL = max_lsx.S +DMAXKERNEL = max_lsx.S -SMINKERNEL = smin_lsx.S -DMINKERNEL = dmin_lsx.S +SMINKERNEL = min_lsx.S +DMINKERNEL = min_lsx.S -ISMAXKERNEL = ismax_lsx.S -IDMAXKERNEL = idmax_lsx.S +ISMAXKERNEL = imax_lsx.S +IDMAXKERNEL = imax_lsx.S -ISMINKERNEL = ismin_lsx.S -IDMINKERNEL = idmin_lsx.S +ISMINKERNEL = imin_lsx.S +IDMINKERNEL = 
imin_lsx.S -ISAMAXKERNEL = isamax_lsx.S -IDAMAXKERNEL = idamax_lsx.S +ISAMAXKERNEL = iamax_lsx.S +IDAMAXKERNEL = iamax_lsx.S +ICAMAXKERNEL = icamax_lsx.S +IZAMAXKERNEL = icamax_lsx.S -ISAMINKERNEL = isamin_lsx.S -IDAMINKERNEL = idamin_lsx.S +ISAMINKERNEL = iamin_lsx.S +IDAMINKERNEL = iamin_lsx.S +ICAMINKERNEL = icamin_lsx.S +IZAMINKERNEL = icamin_lsx.S -SCOPYKERNEL = scopy_lsx.S -DCOPYKERNEL = dcopy_lsx.S +SCOPYKERNEL = copy_lsx.S +DCOPYKERNEL = copy_lsx.S +CCOPYKERNEL = ccopy_lsx.S +ZCOPYKERNEL = ccopy_lsx.S -SSWAPKERNEL = sswap_lsx.S -DSWAPKERNEL = dswap_lsx.S +SSWAPKERNEL = swap_lsx.S +DSWAPKERNEL = swap_lsx.S -SAXPYKERNEL = saxpy_lsx.S -DAXPYKERNEL = daxpy_lsx.S +SAXPYKERNEL = axpy_lsx.S +DAXPYKERNEL = axpy_lsx.S +CAXPYKERNEL = caxpy_lsx.S +ZAXPYKERNEL = caxpy_lsx.S -SAXPBYKERNEL = saxpby_lsx.S -DAXPBYKERNEL = daxpby_lsx.S +SAXPBYKERNEL = axpby_lsx.S +DAXPBYKERNEL = axpby_lsx.S -SSUMKERNEL = ssum_lsx.S -DSUMKERNEL = dsum_lsx.S +SSUMKERNEL = sum_lsx.S +DSUMKERNEL = sum_lsx.S -SASUMKERNEL = sasum_lsx.S -DASUMKERNEL = dasum_lsx.S +SASUMKERNEL = asum_lsx.S +DASUMKERNEL = asum_lsx.S +CASUMKERNEL = casum_lsx.S +ZASUMKERNEL = casum_lsx.S -SROTKERNEL = srot_lsx.S -DROTKERNEL = drot_lsx.S +SROTKERNEL = rot_lsx.S +DROTKERNEL = rot_lsx.S +CROTKERNEL = crot_lsx.S +ZROTKERNEL = crot_lsx.S SNRM2KERNEL = snrm2_lsx.S DNRM2KERNEL = dnrm2_lsx.S +CNRM2KERNEL = cnrm2_lsx.S +ZNRM2KERNEL = znrm2_lsx.S + +CSWAPKERNEL = cswap_lsx.S +ZSWAPKERNEL = cswap_lsx.S + +CSUMKERNEL = csum_lsx.S +ZSUMKERNEL = csum_lsx.S + +DGEMMKERNEL = dgemm_kernel_8x4.S +DGEMMINCOPY = dgemm_ncopy_8_lsx.S +DGEMMITCOPY = dgemm_tcopy_8_lsx.S +DGEMMONCOPY = dgemm_ncopy_4_lsx.S +DGEMMOTCOPY = dgemm_tcopy_4_lsx.S +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CGEMMKERNEL = cgemm_kernel_2x2_lsx.S +CGEMMONCOPY = cgemm_ncopy_2_lsx.S +CGEMMOTCOPY = cgemm_tcopy_2_lsx.S +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index f00abcb32..68360faaf 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -3,57 +3,83 @@ ifndef NO_LASX SDOTKERNEL = dot_lasx.S DSDOTKERNEL = dot_lasx.S DDOTKERNEL = dot_lasx.S +CDOTKERNEL = cdot_lasx.S +ZDOTKERNEL = cdot_lasx.S -SSCALKERNEL = sscal_lasx.S -DSCALKERNEL = dscal_lasx.S +SSCALKERNEL = scal_lasx.S +DSCALKERNEL = scal_lasx.S +CSCALKERNEL = cscal_lasx.S +ZSCALKERNEL = cscal_lasx.S -SAMAXKERNEL = samax_lasx.S -DAMAXKERNEL = damax_lasx.S +SAMAXKERNEL = amax_lasx.S +DAMAXKERNEL = amax_lasx.S +CAMAXKERNEL = camax_lasx.S -SAMINKERNEL = samin_lasx.S -DAMINKERNEL = damin_lasx.S +SAMINKERNEL = amin_lasx.S +DAMINKERNEL = amin_lasx.S +CAMINKERNEL = camin_lasx.S -SMAXKERNEL = smax_lasx.S -DMAXKERNEL = dmax_lasx.S +SMAXKERNEL = max_lsx.S +DMAXKERNEL = max_lsx.S -SMINKERNEL = smin_lasx.S -DMINKERNEL = dmin_lasx.S +SMINKERNEL = min_lsx.S +DMINKERNEL = min_lsx.S -ISMAXKERNEL = ismax_lasx.S -IDMAXKERNEL = idmax_lasx.S 
+ISMAXKERNEL = imax_lasx.S +IDMAXKERNEL = imax_lasx.S -ISMINKERNEL = ismin_lasx.S -IDMINKERNEL = idmin_lasx.S +ISMINKERNEL = imin_lasx.S +IDMINKERNEL = imin_lasx.S -ISAMAXKERNEL = isamax_lasx.S -IDAMAXKERNEL = idamax_lasx.S +ISAMAXKERNEL = iamax_lasx.S +IDAMAXKERNEL = iamax_lasx.S +ICAMAXKERNEL = icamax_lasx.S +IZAMAXKERNEL = icamax_lasx.S -ISAMINKERNEL = isamin_lasx.S -IDAMINKERNEL = idamin_lasx.S +ISAMINKERNEL = iamin_lasx.S +IDAMINKERNEL = iamin_lasx.S +ICAMINKERNEL = icamin_lasx.S +IZAMINKERNEL = icamin_lasx.S -SCOPYKERNEL = scopy_lasx.S -DCOPYKERNEL = dcopy_lasx.S +SCOPYKERNEL = copy_lasx.S +DCOPYKERNEL = copy_lasx.S +CCOPYKERNEL = ccopy_lasx.S +ZCOPYKERNEL = ccopy_lasx.S -SSWAPKERNEL = sswap_lasx.S -DSWAPKERNEL = dswap_lasx.S +SSWAPKERNEL = swap_lasx.S +DSWAPKERNEL = swap_lasx.S -SAXPYKERNEL = saxpy_lasx.S -DAXPYKERNEL = daxpy_lasx.S +SAXPYKERNEL = axpy_lasx.S +DAXPYKERNEL = axpy_lasx.S +CAXPYKERNEL = caxpy_lasx.S +ZAXPYKERNEL = caxpy_lasx.S -SAXPBYKERNEL = saxpby_lasx.S -DAXPBYKERNEL = daxpby_lasx.S +SAXPBYKERNEL = axpby_lasx.S +DAXPBYKERNEL = axpby_lasx.S -SSUMKERNEL = ssum_lasx.S -DSUMKERNEL = dsum_lasx.S +SSUMKERNEL = sum_lasx.S +DSUMKERNEL = sum_lasx.S -SASUMKERNEL = sasum_lasx.S -DASUMKERNEL = dasum_lasx.S +SASUMKERNEL = asum_lasx.S +DASUMKERNEL = asum_lasx.S +CASUMKERNEL = casum_lasx.S +ZASUMKERNEL = casum_lasx.S -SROTKERNEL = srot_lasx.S -DROTKERNEL = drot_lasx.S +SROTKERNEL = rot_lasx.S +DROTKERNEL = rot_lasx.S +CROTKERNEL = crot_lasx.S +ZROTKERNEL = crot_lasx.S SNRM2KERNEL = snrm2_lasx.S DNRM2KERNEL = dnrm2_lasx.S +CNRM2KERNEL = cnrm2_lasx.S +ZNRM2KERNEL = znrm2_lasx.S + +CSWAPKERNEL = cswap_lasx.S +ZSWAPKERNEL = cswap_lasx.S + +CSUMKERNEL = csum_lasx.S +ZSUMKERNEL = csum_lasx.S DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S @@ -81,13 +107,35 @@ SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) SGEMVNKERNEL = sgemv_n_8_lasx.S SGEMVTKERNEL = sgemv_t_8_lasx.S +CGEMMKERNEL = cgemm_kernel_2x2_lsx.S +CGEMMONCOPY = cgemm_ncopy_2_lsx.S +CGEMMOTCOPY = cgemm_tcopy_2_lsx.S +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZGEMMKERNEL = zgemm_kernel_2x2_lasx.S +ZGEMMONCOPY = zgemm_ncopy_2_lasx.S +ZGEMMOTCOPY = zgemm_tcopy_2_lasx.S +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + DTRSMKERNEL_LN = dtrsm_kernel_LN_16x4_lasx.S DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_lasx.S DTRSMKERNEL_RN = dtrsm_kernel_RN_16x4_lasx.S DTRSMKERNEL_RT = dtrsm_kernel_RT_16x4_lasx.S -endif STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif diff --git a/kernel/loongarch64/amax_lasx.S b/kernel/loongarch64/amax_lasx.S new file mode 100644 index 000000000..e964d4ddb --- /dev/null +++ b/kernel/loongarch64/amax_lasx.S @@ -0,0 +1,232 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r12 +#define TEMP $r13 + +#define VM0 $xr0 +#define VM1 $xr1 +#define VM2 $xr2 +#define VX0 $xr3 +#define VX1 $xr4 +#define VX2 $xr5 +#define VX3 $xr6 + +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r17 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT +#ifdef DOUBLE + xvldrepl.d VM0, X, 0 +#else + xvldrepl.w VM0, X, 0 +#endif + XVFSUB VM0, VM0, VM0 + bne INCX, TEMP, .L20 + + srai.d I, N, 4 + bge $r0, I, .L11 + .align 3 + +.L10: +#ifdef DOUBLE + xvld VX0, X, 0 + xvld VX1, X, 32 + xvld VX2, X, 64 + xvld VX3, X, 96 + addi.d I, I, -1 + addi.d X, X, 128 + XVFMAXA VM1, VX0, VX1 + XVFMAXA VM2, VX2, VX3 + XVFMAXA VM0, VM0, VM1 + XVFMAXA VM0, VM0, VM2 +#else + xvld VX0, X, 0 + xvld VX1, X, 32 + addi.d I, I, -1 + addi.d X, X, 64 + XVFMAXA VM1, VX0, VX1 + XVFMAXA VM0, VM0, VM1 +#endif + blt $r0, I, .L10 + +#ifdef DOUBLE + xvrepl128vei.d VX0, VM0, 0 + xvrepl128vei.d VX1, VM0, 1 + XVFMAXA VM0, VX0, VX1 +#else + xvrepl128vei.w VX0, VM0, 0 + xvrepl128vei.w VX1, VM0, 1 + xvrepl128vei.w VX2, VM0, 2 + xvrepl128vei.w VX3, VM0, 3 + XVFMAXA VM1, VX0, VX1 + XVFMAXA VM2, VX2, VX3 + XVFMAXA VM0, VM1, VM2 +#endif + xvpermi.q VM1, VM0, 0x1 + XVFMAXA VM0, VM0, VM1 + .align 3 + +.L11: + andi I, N, 0x0f + bge $r0, I, .L13 + .align 3 + +.L12: /* 0 < N < 16 */ + LD $f1, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + FMAXA $f0, $f0, $f1 + bnez I, .L12 + .align 3 + +.L13: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + srai.d I, N, 3 + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + 
ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmaxa.d VM1, VX0, VX1 + xvfmaxa.d VM0, VM0, VM1 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VM1, t1, 0 + xvinsgr2vr.w VM1, t2, 1 + xvinsgr2vr.w VM1, t3, 2 + xvinsgr2vr.w VM1, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VM1, t1, 4 + xvinsgr2vr.w VM1, t2, 5 + xvinsgr2vr.w VM1, t3, 6 + xvinsgr2vr.w VM1, t4, 7 + xvfmaxa.s VM0, VM0, VM1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvrepl128vei.d VX0, VM0, 0 + xvrepl128vei.d VX1, VM0, 1 + XVFMAXA VM0, VX0, VX1 +#else + xvrepl128vei.w VX0, VM0, 0 + xvrepl128vei.w VX1, VM0, 1 + xvrepl128vei.w VX2, VM0, 2 + xvrepl128vei.w VX3, VM0, 3 + XVFMAXA VM1, VX0, VX1 + XVFMAXA VM2, VX2, VX3 + XVFMAXA VM0, VM1, VM2 +#endif + xvpermi.q VM1, VM0, 1 + XVFMAXA VM0, VM0, VM1 + .align 3 + +.L23: //INCX!=1 and N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: /* 0 < N < 8 */ + LD $f1, X, 0 + addi.d I, I, -1 + add.d X, X, INCX + FMAXA $f0, $f0, $f1 + bnez I, .L24 + .align 3 + +.L999: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/amax_lsx.S b/kernel/loongarch64/amax_lsx.S new file mode 100644 index 000000000..fb3b77a0e --- /dev/null +++ b/kernel/loongarch64/amax_lsx.S @@ -0,0 +1,231 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r12 +#define TEMP $r13 + +#define VM0 $vr0 +#define VM1 $vr1 +#define VM2 $vr2 +#define VX0 $vr3 +#define VX1 $vr4 +#define VX2 $vr5 +#define VX3 $vr6 + +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r17 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT +#ifdef DOUBLE + vldrepl.d VM0, X, 0 +#else + vldrepl.w VM0, X, 0 +#endif + VFSUB VM0, VM0, VM0 + bne INCX, TEMP, .L20 + + srai.d I, N, 3 + bge $r0, I, .L11 + .align 3 + +.L10: +#ifdef DOUBLE + vld VX0, X, 0 + vld VX1, X, 16 + vld VX2, X, 32 + vld VX3, X, 48 + addi.d I, I, -1 + addi.d X, X, 64 + VFMAXA VM1, VX0, VX1 + VFMAXA VM2, VX2, VX3 + VFMAXA VM0, VM0, VM1 + VFMAXA VM0, VM0, VM2 +#else + vld VX0, X, 0 + vld VX1, X, 16 + addi.d I, I, -1 + addi.d X, X, 32 + VFMAXA VM1, VX0, VX1 + VFMAXA VM0, VM0, VM1 +#endif + blt $r0, I, .L10 + +#ifdef DOUBLE + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + VFMAXA VM0, VX0, VX1 +#else + vreplvei.w VX0, VM0, 0 + vreplvei.w VX1, VM0, 1 + vreplvei.w VX2, VM0, 2 + vreplvei.w VX3, VM0, 3 + VFMAXA VM1, VX0, VX1 + VFMAXA VM2, VX2, VX3 + VFMAXA VM0, VM1, VM2 +#endif + .align 3 + +.L11: + andi I, N, 7 + bge $r0, I, .L13 + .align 3 + +.L12: + LD $f1, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + FMAXA $f0, $f0, $f1 + bnez I, .L12 + .align 3 + +.L13: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + srai.d I, N, 3 + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM1, VX0, VX1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM2, VX0, VX1 + vfmaxa.d VM1, VM1, VM2 + vfmaxa.d VM0, VM0, VM1 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfmaxa.s VM1, VX0, VX1 + vfmaxa.s VM0, VM0, VM1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + VFMAXA VM0, VX0, VX1 +#else + vreplvei.w VX0, VM0, 0 + vreplvei.w VX1, VM0, 1 + vreplvei.w VX2, VM0, 2 + vreplvei.w VX3, VM0, 3 + VFMAXA VM1, VX0, VX1 + VFMAXA VM2, VX2, VX3 + VFMAXA VM0, VM1, VM2 +#endif + .align 3 + +.L23: //INCX!=1 and N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f1, X, 0 + addi.d I, I, -1 + add.d X, X, INCX + FMAXA $f0, $f0, $f1 + bnez I, .L24 + .align 3 + +.L999: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/amin_lasx.S 
b/kernel/loongarch64/amin_lasx.S new file mode 100644 index 000000000..0a4359002 --- /dev/null +++ b/kernel/loongarch64/amin_lasx.S @@ -0,0 +1,232 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r12 +#define TEMP $r13 + +#define VM0 $xr0 +#define VM1 $xr1 +#define VM2 $xr2 +#define VX0 $xr3 +#define VX1 $xr4 +#define VX2 $xr5 +#define VX3 $xr6 + +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r17 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT +#ifdef DOUBLE + xvldrepl.d VM0, X, 0 +#else + xvldrepl.w VM0, X, 0 +#endif + XVFSUB VM0, VM0, VM0 + bne INCX, TEMP, .L20 + + srai.d I, N, 4 + bge $r0, I, .L11 + .align 3 + +.L10: +#ifdef DOUBLE + xvld VX0, X, 0 + xvld VX1, X, 32 + xvld VX2, X, 64 + xvld VX3, X, 96 + addi.d I, I, -1 + addi.d X, X, 128 + XVFMINA VM1, VX0, VX1 + XVFMINA VM2, VX2, VX3 + XVFMINA VM0, VM0, VM1 + XVFMINA VM0, VM0, VM2 +#else + xvld VX0, X, 0 + xvld VX1, X, 32 + addi.d I, I, -1 + addi.d X, X, 64 + XVFMINA VM1, VX0, VX1 + XVFMINA VM0, VM0, VM1 +#endif + blt $r0, I, .L10 + +#ifdef DOUBLE + xvrepl128vei.d VX0, VM0, 0 + xvrepl128vei.d VX1, VM0, 1 + XVFMINA VM0, VX0, VX1 +#else + xvrepl128vei.w VX0, VM0, 0 + xvrepl128vei.w VX1, VM0, 1 + xvrepl128vei.w VX2, VM0, 2 + xvrepl128vei.w VX3, VM0, 3 + XVFMINA VM1, VX0, VX1 + XVFMINA VM2, VX2, VX3 + XVFMINA VM0, VM1, VM2 +#endif + xvpermi.q VM1, VM0, 0x1 + XVFMINA VM0, VM0, VM1 + .align 3 + +.L11: + andi I, N, 0x0f + bge $r0, I, .L13 + .align 3 + +.L12: /* 0 < N < 16 */ + LD $f1, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + FMINA $f0, $f0, $f1 + bnez I, .L12 + .align 3 + +.L13: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + srai.d I, N, 3 + bge 
$r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmaxa.d VM1, VX0, VX1 + xvfmaxa.d VM0, VM0, VM1 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VM1, t1, 0 + xvinsgr2vr.w VM1, t2, 1 + xvinsgr2vr.w VM1, t3, 2 + xvinsgr2vr.w VM1, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VM1, t1, 4 + xvinsgr2vr.w VM1, t2, 5 + xvinsgr2vr.w VM1, t3, 6 + xvinsgr2vr.w VM1, t4, 7 + xvfmaxa.s VM0, VM0, VM1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvrepl128vei.d VX0, VM0, 0 + xvrepl128vei.d VX1, VM0, 1 + XVFMINA VM0, VX0, VX1 +#else + xvrepl128vei.w VX0, VM0, 0 + xvrepl128vei.w VX1, VM0, 1 + xvrepl128vei.w VX2, VM0, 2 + xvrepl128vei.w VX3, VM0, 3 + XVFMINA VM1, VX0, VX1 + XVFMINA VM2, VX2, VX3 + XVFMINA VM0, VM1, VM2 +#endif + xvpermi.q VM1, VM0, 1 + XVFMINA VM0, VM0, VM1 + .align 3 + +.L23: //INCX!=1 and N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: /* 0 < N < 8 */ + LD $f1, X, 0 + addi.d I, I, -1 + add.d X, X, INCX + FMINA $f0, $f0, $f1 + bnez I, .L24 + .align 3 + +.L999: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/amin_lsx.S b/kernel/loongarch64/amin_lsx.S new file mode 100644 index 000000000..644caf43c --- /dev/null +++ b/kernel/loongarch64/amin_lsx.S @@ -0,0 +1,232 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r12 +#define TEMP $r13 + +#define VM0 $vr0 +#define VM1 $vr1 +#define VM2 $vr2 +#define VX0 $vr3 +#define VX1 $vr4 +#define VX2 $vr5 +#define VX3 $vr6 + +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r17 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT +#ifdef DOUBLE + vldrepl.d VM0, X, 0 +#else + vldrepl.w VM0, X, 0 +#endif + VFSUB VM0, VM0, VM0 + bne INCX, TEMP, .L20 + + srai.d I, N, 3 + bge $r0, I, .L11 + .align 3 + +.L10: +#ifdef DOUBLE + vld VX0, X, 0 + vld VX1, X, 16 + vld VX2, X, 32 + vld VX3, X, 48 + addi.d I, I, -1 + addi.d X, X, 64 + VFMINA VM1, VX0, VX1 + VFMINA VM2, VX2, VX3 + VFMINA VM0, VM0, VM1 + VFMINA VM0, VM0, VM2 +#else + vld VX0, X, 0 + vld VX1, X, 16 + addi.d I, I, -1 + addi.d X, X, 32 + VFMINA VM1, VX0, VX1 + VFMINA VM0, VM0, VM1 +#endif + blt $r0, I, .L10 + +#ifdef DOUBLE + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + VFMINA VM0, VX0, VX1 +#else + vreplvei.w VX0, VM0, 0 + vreplvei.w VX1, VM0, 1 + vreplvei.w VX2, VM0, 2 + vreplvei.w VX3, VM0, 3 + VFMINA VM1, VX0, VX1 + VFMINA VM2, VX2, VX3 + VFMINA VM0, VM1, VM2 +#endif + .align 3 + +.L11: + andi I, N, 7 + bge $r0, I, .L13 + .align 3 + +.L12: + LD $f1, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + FMINA $f0, $f0, $f1 + bnez I, .L12 + .align 3 + +.L13: + FABS $f0, $f0 + SUB $f0, $f0, $f0 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + srai.d I, N, 3 + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM1, VX0, VX1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM2, VX0, VX1 + vfmaxa.d VM1, VM1, VM2 + vfmaxa.d VM0, VM0, VM1 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfmaxa.s VM1, VX0, VX1 + vfmaxa.s VM0, VM0, VM1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d 
VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + VFMINA VM0, VX0, VX1 +#else + vreplvei.w VX0, VM0, 0 + vreplvei.w VX1, VM0, 1 + vreplvei.w VX2, VM0, 2 + vreplvei.w VX3, VM0, 3 + VFMINA VM1, VX0, VX1 + VFMINA VM2, VX2, VX3 + VFMINA VM0, VM1, VM2 +#endif + .align 3 + +.L23: //INCX!=1 and N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f1, X, 0 + addi.d I, I, -1 + add.d X, X, INCX + FMINA $f0, $f0, $f1 + bnez I, .L24 + .align 3 + +.L999: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/asum_lasx.S b/kernel/loongarch64/asum_lasx.S new file mode 100644 index 000000000..9a2c031f3 --- /dev/null +++ b/kernel/loongarch64/asum_lasx.S @@ -0,0 +1,257 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 +#define VT0 $xr23 +#define VT1 $xr22 +#define res1 $xr16 +#define res2 $xr17 +#define res0 $xr18 +#define neg1 $xr19 + + PROLOGUE + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + xvxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 +#ifdef DOUBLE + li.d t1, -1 + xvreplgr2vr.d neg1, t1 + xvffint.d.l neg1, neg1 +#else + li.w t1, -1 + xvreplgr2vr.w neg1, t1 + xvffint.s.w neg1, neg1 +#endif + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfmul.d VX2, neg1, VX0 + xvfmul.d VX3, neg1, VX1 + xvfcmp.clt.d VT0, VX0, res0 + xvfcmp.clt.d VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 +#else + xvld VX0, X, 0 * SIZE + xvfmul.s VX2, neg1, VX0 + xvfcmp.clt.s VT0, VX0, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvfadd.s res1, VX0, res1 +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD $f12, X, 0 * SIZE + FABS $f12, $f12 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmul.d VX2, neg1, VX0 + xvfmul.d VX3, neg1, VX1 + xvfcmp.clt.d VT0, VX0, res0 + xvfcmp.clt.d VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w 
VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvfmul.s VX2, neg1, VX0 + xvfcmp.clt.s VT0, VX0, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvfadd.s res1, VX0, res1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f12, X, 0 * SIZE + FABS $f12, $f12 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + MOV $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/asum_lsx.S b/kernel/loongarch64/asum_lsx.S new file mode 100644 index 000000000..512b01404 --- /dev/null +++ b/kernel/loongarch64/asum_lsx.S @@ -0,0 +1,258 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 +#define VT0 $vr23 +#define VT1 $vr22 +#define res1 $vr16 +#define res2 $vr17 +#define res0 $vr18 +#define neg1 $vr19 + + PROLOGUE + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + vxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 +#ifdef DOUBLE + li.d t1, -1 + vreplgr2vr.d neg1, t1 + vffint.d.l neg1, neg1 +#else + li.w t1, -1 + vreplgr2vr.w neg1, t1 + vffint.s.w neg1, neg1 +#endif + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 +#else + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vfmul.s VX2, neg1, VX0 + vfmul.s VX3, neg1, VX1 + vfcmp.clt.s VT0, VX0, res0 + vfcmp.clt.s VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.s res2, VX0, VX1 + vfadd.s res1, res1, res2 +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD $f12, X, 0 * SIZE + FABS $f12, $f12 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + 
vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfmul.s VX2, neg1, VX0 + vfmul.s VX3, neg1, VX1 + vfcmp.clt.s VT0, VX0, res0 + vfcmp.clt.s VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.s res2, VX0, VX1 + vfadd.s res1, res1, res2 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f12, X, 0 * SIZE + FABS $f12, $f12 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + MOV $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/daxpby_lasx.S b/kernel/loongarch64/axpby_lasx.S similarity index 55% rename from kernel/loongarch64/daxpby_lasx.S rename to kernel/loongarch64/axpby_lasx.S index 4b19703e7..f1d99cd3b 100644 --- a/kernel/loongarch64/daxpby_lasx.S +++ b/kernel/loongarch64/axpby_lasx.S @@ -1,6 +1,33 @@ -#define ASSEMBLER +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#define ASSEMBLER #include "common.h" + #define N $r4 #define ALPHA $f0 #define X $r5 @@ -32,16 +59,22 @@ bge $r0, N, .L999 li.d TEMP, 1 movgr2fr.d a1, $r0 - ffint.d.l a1, a1 + ffint.s.l a1, a1 slli.d TEMP, TEMP, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT slli.d INCY, INCY, BASE_SHIFT - movfr2gr.d t1, ALPHA + MTG t1, ALPHA + MTG t2, BETA + MTG t3, a1 +#ifdef DOUBLE xvreplgr2vr.d VXA, t1 - movfr2gr.d t2, BETA xvreplgr2vr.d VXB, t2 - movfr2gr.d t3, a1 xvreplgr2vr.d VXZ, t3 +#else + xvreplgr2vr.w VXA, t1 + xvreplgr2vr.w VXB, t2 + xvreplgr2vr.w VXZ, t3 +#endif srai.d I, N, 3 bne INCX, TEMP, .L20 bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 @@ -52,21 +85,22 @@ .L11: bge $r0, I, .L997 - fcmp.ceq.d $fcc0, ALPHA, a1 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L110 - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L112 // ALPHA!=0 BETA==0 b .L111 // ALPHA!=0 BETA!=0 .align 3 .L110: - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L114 // ALPHA==0 BETA==0 b .L113 // ALPHA==0 BETA!=0 .align 3 .L111: // ALPHA!=0 BETA!=0 xvld VX0, X, 0 * SIZE +#ifdef DOUBLE xvld VX2, Y, 0 * SIZE xvld VX1, X, 4 * SIZE xvld VX3, Y, 4 * SIZE @@ -77,6 +111,13 @@ addi.d I, I, -1 xvst VX2, Y, 0 * SIZE xvst VX3, Y, 4 * SIZE +#else + xvld VX2, Y, 0 * SIZE + xvfmul.s VX0, VX0, VXA + addi.d I, I, -1 + xvfmadd.s VX2, VX2, VXB, VX0 + xvst VX2, Y, 0 * SIZE +#endif addi.d X, X, 8 * SIZE addi.d Y, Y, 8 * SIZE blt $r0, I, .L111 @@ -85,34 +126,46 @@ .L112: // ALPHA!=0 BETA==0 xvld VX0, X, 0 * SIZE +#ifdef DOUBLE xvld VX1, X, 4 * SIZE xvfmul.d VX0, VX0, VXA xvfmul.d VX1, VX1, VXA xvst VX0, Y, 0 * SIZE xvst VX1, Y, 4 * SIZE +#else + xvfmul.s VX0, VX0, VXA + addi.d I, I, -1 + xvst VX0, Y, 0 * SIZE +#endif addi.d X, X, 8 * SIZE addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 blt $r0, I, .L112 b .L997 .align 3 .L113: // ALPHA==0 BETA!=0 xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE xvld VX3, Y, 4 * SIZE xvfmul.d VX2, VX2, VXB xvfmul.d VX3, VX3, VXB xvst VX2, Y, 0 * SIZE xvst VX3, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE +#else + xvfmul.s VX2, VX2, VXB + xvst VX2, Y, 0 * SIZE +#endif addi.d I, I, -1 + addi.d Y, Y, 8 * SIZE blt $r0, I, .L113 b .L997 .align 3 .L114: // ALPHA==0 BETA==0 xvst VXZ, Y, 0 * SIZE +#ifdef DOUBLE xvst VXZ, Y, 4 * SIZE +#endif addi.d Y, Y, 8 * SIZE addi.d I, I, -1 blt $r0, I, .L114 @@ -122,21 +175,22 @@ .L12: // INCX==1 and INCY!=1 bge $r0, I, .L997 move YY, Y - fcmp.ceq.d $fcc0, ALPHA, a1 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L120 - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L122 // ALPHA!=0 BETA==0 b .L121 // ALPHA!=0 BETA!=0 .align 3 .L120: - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L124 // ALPHA==0 BETA==0 b .L123 // ALPHA==0 BETA!=0 .align 3 .L121: // ALPHA!=0 BETA!=0 xvld VX0, X, 0 * SIZE +#ifdef DOUBLE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY ld.d t2, Y, 0 * SIZE @@ -182,14 +236,59 @@ xvstelm.d VX3, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VX3, YY, 0, 3 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, 
Y, INCY + xvfmul.s VX0, VX0, VXA + xvfmadd.s VX2, VX2, VXB, VX0 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif add.d YY, YY, INCY addi.d X, X, 8 * SIZE + addi.d I, I, -1 blt $r0, I, .L121 b .L997 .align 3 .L122: // ALPHA!=0 BETA==0 xvld VX0, X, 0 * SIZE +#ifdef DOUBLE xvld VX1, X, 4 * SIZE xvfmul.d VX0, VX0, VXA xvfmul.d VX1, VX1, VXA @@ -208,14 +307,33 @@ xvstelm.d VX1, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VX1, YY, 0, 3 +#else + xvfmul.s VX0, VX0, VXA + addi.d I, I, -1 + xvstelm.w VX0, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 7 +#endif add.d YY, YY, INCY addi.d X, X, 8 * SIZE - addi.d I, I, -1 blt $r0, I, .L122 b .L997 .align 3 .L123: // ALPHA==0 BETA!=0 +#ifdef DOUBLE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY ld.d t2, Y, 0 * SIZE @@ -250,7 +368,6 @@ xvstelm.d VX2, YY, 0, 3 add.d YY, YY, INCY xvfmul.d VX3, VX3, VXB - addi.d I, I, -1 xvstelm.d VX3, YY, 0, 0 add.d YY, YY, INCY xvstelm.d VX3, YY, 0, 1 @@ -258,12 +375,56 @@ xvstelm.d VX3, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VX3, YY, 0, 3 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX2, VX2, VXB + xvstelm.w VX2, YY, 0, 0 add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 blt $r0, I, .L123 b .L997 .align 3 .L124: // ALPHA==0 BETA==0 +#ifdef DOUBLE xvstelm.d VXZ, YY, 0, 0 add.d YY, YY, INCY xvstelm.d VXZ, YY, 0, 1 @@ -279,6 +440,23 @@ xvstelm.d VXZ, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VXZ, YY, 0, 3 +#else + xvstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 7 +#endif add.d YY, YY, INCY addi.d I, I, -1 blt $r0, I, .L124 @@ -287,21 +465,22 @@ .L21:// INCX!=1 and INCY==1 bge $r0, I, .L997 - fcmp.ceq.d $fcc0, ALPHA, a1 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L210 - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L212 // ALPHA!=0 BETA==0 b .L211 // ALPHA!=0 BETA!=0 .align 3 .L210: - fcmp.ceq.d $fcc0, BETA, a1 + 
CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L214 // ALPHA==0 BETA==0 b .L213 // ALPHA==0 BETA!=0 .align 3 .L211: // ALPHA!=0 BETA!=0 xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -334,12 +513,43 @@ xvfmadd.d VX3, VX3, VXB, VX1 addi.d I, I, -1 xvst VX3, Y, 4 * SIZE +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VX0, VXA, VX0 + xvfmadd.s VX2, VX2, VXB, VX0 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE +#endif addi.d Y, Y, 8 * SIZE blt $r0, I, .L211 b .L997 .align 3 .L212: // ALPHA!=0 BETA==0 +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -369,6 +579,35 @@ xvfmul.d VX1, VX1, VXA addi.d I, I, -1 xvst VX1, Y, 4 * SIZE +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VX0, VXA, VX0 + addi.d I, I, -1 + xvst VX0, Y, 0 * SIZE +#endif addi.d Y, Y, 8 * SIZE blt $r0, I, .L212 b .L997 @@ -376,20 +615,27 @@ .L213: // ALPHA==0 BETA!=0 xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE xvld VX3, Y, 4 * SIZE xvfmul.d VX2, VX2, VXB xvfmul.d VX3, VX3, VXB - addi.d I, I, -1 xvst VX2, Y, 0 * SIZE xvst VX3, Y, 4 * SIZE +#else + xvfmul.s VX2, VX2, VXB + xvst VX2, Y, 0 * SIZE +#endif addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 blt $r0, I, .L213 b .L997 .align 3 .L214: // ALPHA==0 BETA==0 xvst VXZ, Y, 0 * SIZE +#ifdef DOUBLE xvst VXZ, Y, 4 * SIZE +#endif addi.d Y, Y, 8 * SIZE addi.d I, I, -1 blt $r0, I, .L214 @@ -399,20 +645,21 @@ .L22: bge $r0, I, .L997 move YY, Y - fcmp.ceq.d $fcc0, ALPHA, a1 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L220 - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L222 // ALPHA!=0 BETA==0 b .L221 // ALPHA!=0 BETA!=0 .align 3 .L220: - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L224 // ALPHA==0 BETA==0 b .L223 // ALPHA==0 BETA!=0 .align 3 .L221: // ALPHA!=0 BETA!=0 +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -481,12 +728,81 @@ xvstelm.d VX3, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VX3, YY, 0, 3 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + add.d 
Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX0, VX0, VXA + xvfmadd.s VX2, VX2, VXB, VX0 + addi.d I, I, -1 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif add.d YY, YY, INCY blt $r0, I, .L221 b .L997 .align 3 .L222: // ALPHA!=0 BETA==0 +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -529,12 +845,56 @@ xvstelm.d VX1, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VX1, YY, 0, 3 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VX0, VX0, VXA + addi.d I, I, -1 + xvstelm.w VX0, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 7 +#endif add.d YY, YY, INCY blt $r0, I, .L222 b .L997 .align 3 .L223: // ALPHA==0 BETA!=0 +#ifdef DOUBLE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY ld.d t2, Y, 0 * SIZE @@ -577,12 +937,56 @@ xvstelm.d VX3, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VX3, YY, 0, 3 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX2, VX2, VXB + addi.d I, I, -1 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif add.d YY, YY, INCY blt $r0, I, .L223 b .L997 .align 3 .L224: // ALPHA==0 BETA==0 +#ifdef DOUBLE xvstelm.d VXZ, YY, 0, 0 add.d YY, YY, INCY xvstelm.d VXZ, YY, 0, 1 @@ -598,6 +1002,23 @@ xvstelm.d VXZ, 
YY, 0, 2 add.d YY, YY, INCY xvstelm.d VXZ, YY, 0, 3 +#else + xvstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 7 +#endif add.d YY, YY, INCY addi.d I, I, -1 blt $r0, I, .L224 @@ -610,12 +1031,12 @@ .align 3 .L998: - fld.d $f12, X, 0 * SIZE - fld.d $f13, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f13, Y, 0 * SIZE addi.d I, I, -1 - fmul.d $f12, $f12, ALPHA - fmadd.d $f13, $f13, BETA, $f12 - fst.d $f13, Y, 0 * SIZE + MUL $f12, $f12, ALPHA + MADD $f13, $f13, BETA, $f12 + ST $f13, Y, 0 * SIZE add.d X, X, INCX add.d Y, Y, INCY blt $r0, I, .L998 diff --git a/kernel/loongarch64/daxpby_lsx.S b/kernel/loongarch64/axpby_lsx.S similarity index 56% rename from kernel/loongarch64/daxpby_lsx.S rename to kernel/loongarch64/axpby_lsx.S index 9aafbaf2a..45154c262 100644 --- a/kernel/loongarch64/daxpby_lsx.S +++ b/kernel/loongarch64/axpby_lsx.S @@ -1,6 +1,33 @@ -#define ASSEMBLER +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
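For reference, the four labelled branches in the axpby kernels above (.L211 to .L214 for the INCX!=1 path, .L221 to .L224 for the fully strided path) correspond to the alpha/beta special cases of the scalar operation sketched below. This is a minimal illustrative sketch in single precision, not code from this patch; the double-precision form is analogous and positive increments are assumed.

```c
/* Minimal scalar sketch of the axpby operation: y[i] = alpha*x[i] + beta*y[i].
 * The branch comments map to the cases the assembly tests first; the function
 * name and the positive-increment assumption are illustrative only. */
#include <stddef.h>

static void axpby_ref(size_t n, float alpha, const float *x, ptrdiff_t incx,
                      float beta, float *y, ptrdiff_t incy)
{
    for (size_t i = 0; i < n; i++) {
        const float xi = x[(ptrdiff_t)i * incx];
        float *yp = &y[(ptrdiff_t)i * incy];
        if (alpha != 0.0f && beta != 0.0f)      /* .L211 / .L221 */
            *yp = alpha * xi + beta * *yp;
        else if (alpha != 0.0f)                 /* .L212 / .L222: beta == 0 */
            *yp = alpha * xi;
        else if (beta != 0.0f)                  /* .L213 / .L223: alpha == 0 */
            *yp = beta * *yp;
        else                                    /* .L214 / .L224: store zeros */
            *yp = 0.0f;
    }
}
```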
+*****************************************************************************/ +#define ASSEMBLER #include "common.h" + #define N $r4 #define ALPHA $f0 #define X $r5 @@ -32,16 +59,22 @@ bge $r0, N, .L999 li.d TEMP, 1 movgr2fr.d a1, $r0 - ffint.d.l a1, a1 + ffint.s.l a1, a1 slli.d TEMP, TEMP, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT slli.d INCY, INCY, BASE_SHIFT - movfr2gr.d t1, ALPHA + MTG t1, ALPHA + MTG t2, BETA + MTG t3, a1 +#ifdef DOUBLE vreplgr2vr.d VXA, t1 - movfr2gr.d t2, BETA vreplgr2vr.d VXB, t2 - movfr2gr.d t3, a1 vreplgr2vr.d VXZ, t3 +#else + vreplgr2vr.w VXA, t1 + vreplgr2vr.w VXB, t2 + vreplgr2vr.w VXZ, t3 +#endif srai.d I, N, 3 bne INCX, TEMP, .L20 bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 @@ -52,15 +85,15 @@ .L11: bge $r0, I, .L997 - fcmp.ceq.d $fcc0, ALPHA, a1 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L110 - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L112 // ALPHA!=0 BETA==0 b .L111 // ALPHA!=0 BETA!=0 .align 3 .L110: - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L114 // ALPHA==0 BETA==0 b .L113 // ALPHA==0 BETA!=0 .align 3 @@ -68,6 +101,7 @@ .L111: // ALPHA!=0 BETA!=0 vld VX0, X, 0 * SIZE vld VX2, Y, 0 * SIZE +#ifdef DOUBLE vld VX1, X, 2 * SIZE vld VX3, Y, 2 * SIZE vfmul.d VX0, VX0, VXA @@ -86,6 +120,16 @@ vfmadd.d VX3, VX3, VXB, VX1 vst VX2, Y, 4 * SIZE vst VX3, Y, 6 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vfmul.s VX0, VX0, VXA + vfmul.s VX1, VX1, VXA + vfmadd.s VX2, VX2, VXB, VX0 + vfmadd.s VX3, VX3, VXB, VX1 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE +#endif addi.d X, X, 8 * SIZE addi.d Y, Y, 8 * SIZE addi.d I, I, -1 @@ -95,6 +139,7 @@ .L112: // ALPHA!=0 BETA==0 vld VX0, X, 0 * SIZE +#ifdef DOUBLE vld VX1, X, 2 * SIZE vfmul.d VX0, VX0, VXA vfmul.d VX1, VX1, VXA @@ -106,6 +151,13 @@ vfmul.d VX3, VX3, VXA vst VX2, Y, 4 * SIZE vst VX3, Y, 6 * SIZE +#else + vld VX1, X, 4 * SIZE + vfmul.s VX0, VX0, VXA + vfmul.s VX1, VX1, VXA + vst VX0, Y, 0 * SIZE + vst VX1, Y, 4 * SIZE +#endif addi.d X, X, 8 * SIZE addi.d Y, Y, 8 * SIZE addi.d I, I, -1 @@ -113,7 +165,8 @@ b .L997 .align 3 -.L113: // ALPHA==0 BETA!=0\ +.L113: // ALPHA==0 BETA!=0 +#ifdef DOUBLE vld VX0, Y, 0 * SIZE vld VX1, Y, 2 * SIZE vfmul.d VX0, VX0, VXB @@ -126,6 +179,14 @@ vfmul.d VX3, VX3, VXB vst VX2, Y, 4 * SIZE vst VX3, Y, 6 * SIZE +#else + vld VX2, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + vfmul.s VX2, VX2, VXB + vfmul.s VX3, VX3, VXB + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE +#endif addi.d Y, Y, 8 * SIZE addi.d I, I, -1 blt $r0, I, .L113 @@ -134,9 +195,13 @@ .L114: // ALPHA==0 BETA==0 vst VXZ, Y, 0 * SIZE +#ifdef DOUBLE vst VXZ, Y, 2 * SIZE vst VXZ, Y, 4 * SIZE vst VXZ, Y, 6 * SIZE +#else + vst VXZ, Y, 4 * SIZE +#endif addi.d Y, Y, 8 * SIZE addi.d I, I, -1 blt $r0, I, .L114 @@ -146,21 +211,22 @@ .L12: // INCX==1 and INCY!=1 bge $r0, I, .L997 move YY, Y - fcmp.ceq.d $fcc0, ALPHA, a1 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L120 - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L122 // ALPHA!=0 BETA==0 b .L121 // ALPHA!=0 BETA!=0 .align 3 .L120: - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L124 // ALPHA==0 BETA==0 b .L123 // ALPHA==0 BETA!=0 .align 3 .L121: // ALPHA!=0 BETA!=0 vld VX0, X, 0 * SIZE +#ifdef DOUBLE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY ld.d t2, Y, 0 * SIZE @@ -212,6 +278,53 @@ vstelm.d VX3, YY, 0, 0 add.d YY, YY, INCY vstelm.d VX3, YY, 0, 1 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + 
vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VX0, VX0, VXA + vld VX1, X, 4 * SIZE + vfmadd.s VX2, VX2, VXB, VX0 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + vfmul.s VX1, VX1, VXA + vfmadd.s VX3, VX3, VXB, VX1 + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 +#endif add.d YY, YY, INCY addi.d X, X, 8 * SIZE blt $r0, I, .L121 @@ -220,6 +333,7 @@ .L122: // ALPHA!=0 BETA==0 vld VX0, X, 0 * SIZE +#ifdef DOUBLE vld VX1, X, 2 * SIZE vfmul.d VX0, VX0, VXA vfmul.d VX1, VX1, VXA @@ -242,6 +356,26 @@ vstelm.d VX1, YY, 0, 0 add.d YY, YY, INCY vstelm.d VX1, YY, 0, 1 +#else + vld VX1, X, 4 * SIZE + vfmul.s VX0, VX0, VXA + vfmul.s VX1, VX1, VXA + vstelm.w VX0, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 3 +#endif add.d YY, YY, INCY addi.d X, X, 8 * SIZE addi.d I, I, -1 @@ -250,6 +384,7 @@ .align 3 .L123: // ALPHA==0 BETA!=0 +#ifdef DOUBLE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY ld.d t2, Y, 0 * SIZE @@ -294,12 +429,57 @@ vstelm.d VX3, YY, 0, 0 add.d YY, YY, INCY vstelm.d VX3, YY, 0, 1 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VX2, VX2, VXB + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + vfmul.s VX3, VX3, VXB + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 +#endif add.d YY, YY, INCY blt $r0, I, .L123 b .L997 .align 3 .L124: // ALPHA==0 BETA==0 +#ifdef DOUBLE vstelm.d VXZ, YY, 0, 0 add.d YY, YY, INCY vstelm.d VXZ, YY, 0, 1 @@ -315,6 +495,23 @@ vstelm.d VXZ, YY, 0, 0 add.d YY, YY, INCY vstelm.d VXZ, YY, 0, 1 +#else + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 +#endif add.d YY, YY, INCY addi.d I, I, -1 blt $r0, I, .L124 @@ -323,21 +520,22 @@ .L21:// INCX!=1 
and INCY==1 bge $r0, I, .L997 - fcmp.ceq.d $fcc0, ALPHA, a1 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L210 - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L212 // ALPHA!=0 BETA==0 b .L211 // ALPHA!=0 BETA!=0 .align 3 .L210: - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L214 // ALPHA==0 BETA==0 b .L213 // ALPHA==0 BETA!=0 .align 3 .L211: // ALPHA!=0 BETA!=0 vld VX2, Y, 0 * SIZE +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -378,12 +576,47 @@ vfmadd.d VX3, VX3, VXB, VX1 addi.d I, I, -1 vst VX3, Y, 6 * SIZE +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmul.s VX0, VXA, VX0 + vld VX3, Y, 4 * SIZE + vfmadd.s VX2, VX2, VXB, VX0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vst VX2, Y, 0 * SIZE + vfmul.s VX1, VX1, VXA + vfmadd.s VX3, VX3, VXB, VX1 + addi.d I, I, -1 + vst VX3, Y, 4 * SIZE +#endif addi.d Y, Y, 8 * SIZE blt $r0, I, .L211 b .L997 .align 3 .L212: // ALPHA!=0 BETA==0 +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -417,6 +650,37 @@ vfmul.d VX1, VX1, VXA addi.d I, I, -1 vst VX1, Y, 6 * SIZE +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmul.s VX0, VXA, VX0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vst VX0, Y, 0 * SIZE + vfmul.s VX1, VX1, VXA + addi.d I, I, -1 + vst VX1, Y, 4 * SIZE +#endif addi.d Y, Y, 8 * SIZE blt $r0, I, .L212 b .L997 @@ -424,6 +688,7 @@ .L213: // ALPHA==0 BETA!=0 vld VX2, Y, 0 * SIZE +#ifdef DOUBLE vld VX3, Y, 2 * SIZE vfmul.d VX2, VX2, VXB vfmul.d VX3, VX3, VXB @@ -433,19 +698,30 @@ vld VX3, Y, 6 * SIZE vfmul.d VX2, VX2, VXB vfmul.d VX3, VX3, VXB - addi.d I, I, -1 vst VX2, Y, 4 * SIZE vst VX3, Y, 6 * SIZE +#else + vld VX3, Y, 4 * SIZE + vfmul.s VX2, VX2, VXB + vfmul.s VX3, VX3, VXB + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE +#endif addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 blt $r0, I, .L213 b .L997 .align 3 .L214: // ALPHA==0 BETA==0 vst VXZ, Y, 0 * SIZE +#ifdef DOUBLE vst VXZ, Y, 2 * SIZE vst VXZ, Y, 4 * SIZE vst VXZ, Y, 6 * SIZE +#else + vst VXZ, Y, 4 * SIZE +#endif addi.d Y, Y, 8 * SIZE addi.d I, I, -1 blt $r0, I, .L214 @@ -455,20 +731,21 @@ .L22: bge $r0, I, .L997 move YY, Y - fcmp.ceq.d $fcc0, ALPHA, a1 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L220 - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L222 // ALPHA!=0 BETA==0 b .L221 // ALPHA!=0 BETA!=0 .align 3 .L220: - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L224 // ALPHA==0 BETA==0 b .L223 // ALPHA==0 BETA!=0 .align 3 .L221: // ALPHA!=0 BETA!=0 +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -541,12 +818,83 @@ vstelm.d VX3, YY, 0, 0 add.d YY, YY, 
INCY vstelm.d VX3, YY, 0, 1 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VX0, VX0, VXA + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vfmadd.s VX2, VX2, VXB, VX0 + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vfmul.s VX1, VX1, VXA + addi.d I, I, -1 + vfmadd.s VX3, VX3, VXB, VX1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 +#endif add.d YY, YY, INCY blt $r0, I, .L221 b .L997 .align 3 .L222: // ALPHA!=0 BETA==0 +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -591,12 +939,57 @@ vstelm.d VX1, YY, 0, 0 add.d YY, YY, INCY vstelm.d VX1, YY, 0, 1 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmul.s VX0, VX0, VXA + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vstelm.w VX0, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 3 + add.d YY, YY, INCY + vfmul.s VX1, VX1, VXA + addi.d I, I, -1 + vstelm.w VX1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 3 +#endif add.d YY, YY, INCY blt $r0, I, .L222 b .L997 .align 3 .L223: // ALPHA==0 BETA!=0 +#ifdef DOUBLE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY ld.d t2, Y, 0 * SIZE @@ -641,12 +1034,57 @@ vstelm.d VX3, YY, 0, 0 add.d YY, YY, INCY vstelm.d VX3, YY, 0, 1 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VX2, VX2, VXB + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 
+ vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + vfmul.s VX3, VX3, VXB + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 +#endif add.d YY, YY, INCY blt $r0, I, .L223 b .L997 .align 3 .L224: // ALPHA==0 BETA==0 +#ifdef DOUBLE vstelm.d VXZ, YY, 0, 0 add.d YY, YY, INCY vstelm.d VXZ, YY, 0, 1 @@ -662,6 +1100,23 @@ vstelm.d VXZ, YY, 0, 0 add.d YY, YY, INCY vstelm.d VXZ, YY, 0, 1 +#else + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 +#endif add.d YY, YY, INCY addi.d I, I, -1 blt $r0, I, .L224 @@ -674,12 +1129,12 @@ .align 3 .L998: - fld.d $f12, X, 0 * SIZE - fld.d $f13, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f13, Y, 0 * SIZE addi.d I, I, -1 - fmul.d $f12, $f12, ALPHA - fmadd.d $f13, $f13, BETA, $f12 - fst.d $f13, Y, 0 * SIZE + MUL $f12, $f12, ALPHA + MADD $f13, $f13, BETA, $f12 + ST $f13, Y, 0 * SIZE add.d X, X, INCX add.d Y, Y, INCY blt $r0, I, .L998 diff --git a/kernel/loongarch64/daxpy_lasx.S b/kernel/loongarch64/axpy_lasx.S similarity index 52% rename from kernel/loongarch64/daxpy_lasx.S rename to kernel/loongarch64/axpy_lasx.S index bafd871ab..707fd09b5 100644 --- a/kernel/loongarch64/daxpy_lasx.S +++ b/kernel/loongarch64/axpy_lasx.S @@ -1,6 +1,33 @@ -#define ASSEMBLER +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
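In the non-unit-stride paths of the LSX axpby kernel above, each group of four single-precision elements is gathered with ld.w plus vinsgr2vr.w, combined with vfmul.s and vfmadd.s, and written back lane by lane with vstelm.w. A hedged C sketch of one such 4-lane step, with an illustrative helper name and positive increments assumed:

```c
/* One 4-lane step of the strided single-precision path, spelled out in C.
 * Lane numbers mirror the vinsgr2vr.w / vstelm.w indices. */
static void axpby_step4_strided(float alpha, const float *x, long incx,
                                float beta, float *y, long incy)
{
    float vx[4], vy[4];
    for (int lane = 0; lane < 4; lane++) {   /* ld.w + vinsgr2vr.w gathers */
        vx[lane] = x[lane * incx];
        vy[lane] = y[lane * incy];
    }
    for (int lane = 0; lane < 4; lane++)     /* vfmul.s / vfmadd.s */
        vy[lane] = alpha * vx[lane] + beta * vy[lane];
    for (int lane = 0; lane < 4; lane++)     /* vstelm.w scatters */
        y[lane * incy] = vy[lane];
}
```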
+*****************************************************************************/ +#define ASSEMBLER #include "common.h" + #define N $r4 #define XX $r5 #define YY $r6 @@ -35,16 +62,20 @@ bge $r0, N, .L999 li.d TEMP, 1 movgr2fr.d a1, $r0 - ffint.d.l a1, a1 + FFINT a1, a1 movgr2fr.d a2, TEMP - ffint.d.l a2, a2 - fcmp.ceq.d $fcc0, ALPHA, a1 + FFINT a2, a2 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L999 slli.d TEMP, TEMP, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT slli.d INCY, INCY, BASE_SHIFT - movfr2gr.d t1, ALPHA + MTG t1, ALPHA +#ifdef DOUBLE xvreplgr2vr.d VXA, t1 +#else + xvreplgr2vr.w VXA, t1 +#endif srai.d I, N, 3 bne INCX, TEMP, .L20 @@ -56,11 +87,12 @@ .L11: bge $r0, I, .L113 - fcmp.ceq.d $fcc0, ALPHA, a2 + CMPEQ $fcc0, ALPHA, a2 bceqz $fcc0, .L112 .align 3 .L111: +#ifdef DOUBLE xvld VX0, X, 0 * SIZE xvld VX2, Y, 0 * SIZE xvld VX1, X, 4 * SIZE @@ -70,6 +102,13 @@ addi.d I, I, -1 xvst VX2, Y, 0 * SIZE xvst VX3, Y, 4 * SIZE +#else + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + addi.d I, I, -1 + xvfadd.s VX2, VX0, VX2 + xvst VX2, Y, 0 * SIZE +#endif addi.d X, X, 8 * SIZE addi.d Y, Y, 8 * SIZE blt $r0, I, .L111 @@ -77,6 +116,7 @@ .align 3 .L112: +#ifdef DOUBLE xvld VX0, X, 0 * SIZE xvld VX2, Y, 0 * SIZE xvld VX1, X, 4 * SIZE @@ -86,6 +126,13 @@ addi.d I, I, -1 xvst VX2, Y, 0 * SIZE xvst VX3, Y, 4 * SIZE +#else + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + addi.d I, I, -1 + xvfmadd.s VX2, VX0, VXA, VX2 + xvst VX2, Y, 0 * SIZE +#endif addi.d X, X, 8 * SIZE addi.d Y, Y, 8 * SIZE blt $r0, I, .L112 @@ -97,11 +144,11 @@ .align 3 .L114: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE addi.d I, I, -1 - fmadd.d $f14, $f12, $f0, $f14 - fst.d $f14, Y, 0 * SIZE + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE addi.d X, X, SIZE addi.d Y, Y, SIZE blt $r0, I, .L114 @@ -114,6 +161,7 @@ .align 3 .L121: +#ifdef DOUBLE xvld VX0, X, 0 * SIZE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY @@ -158,6 +206,50 @@ xvstelm.d VX3, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VX3, YY, 0, 3 +#else + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmadd.s VX2, VX0, VXA, VX2 + addi.d I, I, -1 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif add.d YY, YY, INCY addi.d X, X, 8 * SIZE blt $r0, I, .L121 @@ -169,11 +261,11 @@ .align 3 .L123: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE addi.d I, I, -1 - fmadd.d $f14, $f12, $f0, $f14 - fst.d $f14, Y, 0 * SIZE + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE addi.d X, X, SIZE add.d Y, Y, INCY blt $r0, I, .L123 @@ -185,6 +277,7 @@ .align 3 .L211: +#ifdef DOUBLE xvld VX2, Y, 0 * SIZE ld.d t1, X, 0 * SIZE add.d X, X, INCX @@ -217,6 +310,37 @@ addi.d I, I, -1 xvst VX3, Y, 
4 * SIZE addi.d Y, Y, 8 * SIZE +#else + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvfmadd.s VX2, VX0, VXA, VX2 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + addi.d Y, Y, 8 * SIZE +#endif blt $r0, I, .L211 .align 3 @@ -226,11 +350,11 @@ .align 3 .L213: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE addi.d I, I, -1 - fmadd.d $f14, $f12, $f0, $f14 - fst.d $f14, Y, 0 * SIZE + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE add.d X, X, INCX addi.d Y, Y, SIZE blt $r0, I, .L213 @@ -243,6 +367,7 @@ .align 3 .L222: +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -309,6 +434,73 @@ xvstelm.d VX3, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VX3, YY, 0, 3 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmadd.s VX2, VX0, VXA, VX2 + addi.d I, I, -1 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif add.d YY, YY, INCY blt $r0, I, .L222 .align 3 @@ -319,15 +511,14 @@ .align 3 .L224: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE addi.d I, I, -1 - fmadd.d $f14, $f12, $f0, $f14 - fst.d $f14, Y, 0 * SIZE + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE add.d X, X, INCX add.d Y, Y, INCY blt $r0, I, .L224 - b .L999 .align 3 .L999: diff --git a/kernel/loongarch64/daxpy_lsx.S b/kernel/loongarch64/axpy_lsx.S similarity index 53% rename from kernel/loongarch64/daxpy_lsx.S rename to kernel/loongarch64/axpy_lsx.S index fc88f0bb9..0d74e2bce 100644 --- a/kernel/loongarch64/daxpy_lsx.S +++ b/kernel/loongarch64/axpy_lsx.S @@ -1,6 +1,33 @@ -#define ASSEMBLER +/*************************************************************************** 
+Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#define ASSEMBLER #include "common.h" + #define N $r4 #define XX $r5 #define YY $r6 @@ -35,16 +62,20 @@ bge $r0, N, .L999 li.d TEMP, 1 movgr2fr.d a1, $r0 - ffint.d.l a1, a1 + FFINT a1, a1 movgr2fr.d a2, TEMP - ffint.d.l a2, a2 - fcmp.ceq.d $fcc0, ALPHA, a1 + FFINT a2, a2 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L999 slli.d TEMP, TEMP, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT slli.d INCY, INCY, BASE_SHIFT - movfr2gr.d t1, ALPHA + MTG t1, ALPHA +#ifdef DOUBLE vreplgr2vr.d VXA, t1 +#else + vreplgr2vr.w VXA, t1 +#endif srai.d I, N, 3 bne INCX, TEMP, .L20 @@ -56,11 +87,12 @@ .L11: bge $r0, I, .L113 - fcmp.ceq.d $fcc0, ALPHA, a2 + CMPEQ $fcc0, ALPHA, a2 bceqz $fcc0, .L112 .align 3 .L111: +#ifdef DOUBLE vld VX0, X, 0 * SIZE vld VX2, Y, 0 * SIZE vld VX1, X, 2 * SIZE @@ -75,16 +107,27 @@ vld VX3, Y, 6 * SIZE vfadd.d VX2, VX0, VX2 vfadd.d VX3, VX1, VX3 - addi.d I, I, -1 vst VX2, Y, 4 * SIZE vst VX3, Y, 6 * SIZE +#else + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vfadd.s VX2, VX0, VX2 + vfadd.s VX3, VX1, VX3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE +#endif addi.d X, X, 8 * SIZE addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 blt $r0, I, .L111 b .L113 .align 3 .L112: +#ifdef DOUBLE vld VX0, X, 0 * SIZE vld VX2, Y, 0 * SIZE vld VX1, X, 2 * SIZE @@ -104,6 +147,19 @@ vst VX2, Y, 4 * SIZE vst VX3, Y, 6 * SIZE addi.d Y, Y, 8 * SIZE +#else + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vfmadd.s VX2, VX0, VXA, VX2 + vfmadd.s VX3, VX1, VXA, VX3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 +#endif blt $r0, I, .L112 .align 3 @@ -113,11 +169,11 @@ .align 3 .L114: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE addi.d I, I, -1 - fmadd.d $f14, $f12, $f0, $f14 - fst.d $f14, Y, 0 * SIZE + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE addi.d X, X, 
SIZE addi.d Y, Y, SIZE blt $r0, I, .L114 @@ -130,6 +186,7 @@ .align 3 .L121: +#ifdef DOUBLE vld VX0, X, 0 * SIZE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY @@ -180,6 +237,54 @@ add.d YY, YY, INCY addi.d X, X, 8 * SIZE addi.d I, I, -1 +#else + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmadd.s VX2, VX0, VXA, VX2 + vld VX1, X, 4 * SIZE + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vfmadd.s VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE +#endif blt $r0, I, .L121 .align 3 @@ -189,11 +294,11 @@ .align 3 .L123: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE addi.d I, I, -1 - fmadd.d $f14, $f12, $f0, $f14 - fst.d $f14, Y, 0 * SIZE + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE addi.d X, X, SIZE add.d Y, Y, INCY blt $r0, I, .L123 @@ -205,6 +310,7 @@ .align 3 .L211: +#ifdef DOUBLE vld VX2, Y, 0 * SIZE ld.d t1, X, 0 * SIZE add.d X, X, INCX @@ -242,6 +348,39 @@ vfmadd.d VX3, VX1, VXA, VX3 addi.d I, I, -1 vst VX3, Y, 6 * SIZE +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmadd.s VX2, VX0, VXA, VX2 + vld VX3, Y, 4 * SIZE + vst VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + vfmadd.s VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vst VX3, Y, 4 * SIZE +#endif addi.d Y, Y, 8 * SIZE blt $r0, I, .L211 .align 3 @@ -252,11 +391,11 @@ .align 3 .L213: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE addi.d I, I, -1 - fmadd.d $f14, $f12, $f0, $f14 - fst.d $f14, Y, 0 * SIZE + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE add.d X, X, INCX addi.d Y, Y, SIZE blt $r0, I, .L213 @@ -269,6 +408,7 @@ .align 3 .L222: +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -337,6 +477,74 @@ vstelm.d VX3, YY, 0, 0 add.d YY, YY, INCY vstelm.d VX3, YY, 0, 1 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w 
VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmadd.s VX2, VX0, VXA, VX2 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vfmadd.s VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 +#endif add.d YY, YY, INCY blt $r0, I, .L222 .align 3 @@ -347,11 +555,11 @@ .align 3 .L224: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE addi.d I, I, -1 - fmadd.d $f14, $f12, $f0, $f14 - fst.d $f14, Y, 0 * SIZE + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE add.d X, X, INCX add.d Y, Y, INCY blt $r0, I, .L224 diff --git a/kernel/loongarch64/camax_lasx.S b/kernel/loongarch64/camax_lasx.S new file mode 100644 index 000000000..7013430cb --- /dev/null +++ b/kernel/loongarch64/camax_lasx.S @@ -0,0 +1,194 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
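The renamed axpy kernels above implement y := alpha*x + y for both precisions, returning immediately when alpha is zero and taking a plain vfadd/xvfadd path when alpha equals one (.L111). A minimal scalar sketch, assuming positive increments; the function name is illustrative, not part of the patch:

```c
/* Scalar sketch of the axpy operation: y[i] += alpha * x[i].
 * The early return matches the kernel's branch to .L999 when alpha == 0. */
static void axpy_ref(long n, float alpha, const float *x, long incx,
                     float *y, long incy)
{
    if (n <= 0 || alpha == 0.0f)
        return;
    for (long i = 0; i < n; i++)
        y[i * incy] += alpha * x[i * incx];
}
```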
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $f14 +#define t2 $f18 +#define t3 $f15 +#define t4 $f17 +#define s1 $f22 +#define s2 $f9 +#define s3 $f10 +#define s4 $f11 +#define TEMP $r16 +#define a0 $f20 +#define a1 $f21 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VT0 $xr13 +#define VT1 $xr14 +#define res0 $xr18 +#define neg1 $xr19 +#define VX0 $xr20 +#define VX1 $xr21 +#define VM0 $xr22 +#define VM1 $xr23 + + PROLOGUE + xvxor.v VM0, VM0, VM0 + xvxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + li.w I, -1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + xvreplgr2vr.w neg1, I + xvffint.s.w neg1, neg1 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L23 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvld VX1, X, 8 * SIZE + addi.d I, I, -1 + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s x3, neg1, x1 + xvfmul.s x4, neg1, x2 + xvfcmp.clt.s VT0, x1, res0 + xvfcmp.clt.s VT1, x2, res0 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VT1 + addi.d X, X, 16 * SIZE + xvfadd.s VM1, x1, x2 + xvfmax.s VM0, VM0, VM1 + blt $r0, I, .L10 + .align 3 + +.L11: + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfmax.s VM1, x1, x2 + xvfmax.s VM0, x3, x4 + xvfmax.s VM0, VM0, VM1 + b .L23 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L23 + .align 3 + +.L21: + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + addi.d I, I, -1 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s3, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s4, t1, t3 + blt $r0, I, .L21 + .align 3 + +.L22: + fmax.s s1, s1, s2 + fmax.s s3, s3, s4 + fmax.s s1, s1, s3 + .align 3 + +.L23: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + addi.d I, I, -1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 + add.d X, X, INCX + fmax.s s1, a0, s1 + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/camax_lsx.S b/kernel/loongarch64/camax_lsx.S new file mode 100644 index 000000000..2e55629de --- /dev/null +++ b/kernel/loongarch64/camax_lsx.S @@ -0,0 +1,206 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $f14 +#define t2 $f18 +#define t3 $f15 +#define t4 $f17 +#define s1 $f22 +#define s2 $f9 +#define s3 $f10 +#define s4 $f11 +#define TEMP $r16 +#define a0 $f20 +#define a1 $f21 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VT0 $vr13 +#define VT1 $vr14 +#define res0 $vr18 +#define neg1 $vr19 +#define VX0 $vr20 +#define VX1 $vr21 +#define VM0 $vr22 +#define VM1 $vr23 + + PROLOGUE + vxor.v VM0, VM0, VM0 + vxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + li.w I, -1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + vreplgr2vr.w neg1, I + vffint.s.w neg1, neg1 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L23 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + addi.d I, I, -1 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, neg1, x1 + vfmul.s x4, neg1, x2 + vfcmp.clt.s VT0, x1, res0 + vfcmp.clt.s VT1, x2, res0 + vld VX0, X, 8 * SIZE + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT1 + vld VX1, X, 12 * SIZE + vfadd.s VM1, x1, x2 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, neg1, x1 + vfmul.s x4, neg1, x2 + vfcmp.clt.s VT0, x1, res0 + vfcmp.clt.s VT1, x2, res0 + addi.d X, X, 16 * SIZE + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT1 + vfadd.s x1, x1, x2 + vfmax.s VM1, x1, VM1 + vfmax.s VM0, VM0, VM1 + blt $r0, I, .L10 + .align 3 + +.L11: + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmax.s VM1, x1, x2 + vfmax.s VM0, x3, x4 + vfmax.s VM0, VM0, VM1 + b .L23 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L23 + .align 3 + +.L21: + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, 
t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + addi.d I, I, -1 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s3, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s4, t1, t3 + blt $r0, I, .L21 + .align 3 + +.L22: + fmax.s s1, s1, s2 + fmax.s s3, s3, s4 + fmax.s s1, s1, s3 + .align 3 + +.L23: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + fld.s a0, X, 0 * SIZE + fld.s a1, X, 1 * SIZE + addi.d I, I, -1 + fabs.s a0, a0 + fabs.s a1, a1 + fadd.s a0, a0, a1 + add.d X, X, INCX + fmax.s s1, a0, s1 + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/camin_lasx.S b/kernel/loongarch64/camin_lasx.S new file mode 100644 index 000000000..d7931d30a --- /dev/null +++ b/kernel/loongarch64/camin_lasx.S @@ -0,0 +1,199 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
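The new camax kernels above reduce the usual BLAS complex absolute value, |Re| + |Im|, using fabs/fadd/fmax in the scalar tail and a multiply by -1 plus fcmp.clt/bitsel in place of a vector fabs in the main loops. A scalar sketch of the reduction they perform, with an illustrative name and a positive increment assumed:

```c
#include <math.h>

/* Scalar sketch of camax: the largest |Re(x_i)| + |Im(x_i)| over n
 * interleaved complex singles; returns 0 for an empty vector, matching
 * the zero-initialised VM0/s1 in the assembly. */
static float camax_ref(long n, const float *x, long incx)
{
    float m = 0.0f;
    for (long i = 0; i < n; i++) {
        const float *p = x + 2 * i * incx;   /* complex element i */
        float v = fabsf(p[0]) + fabsf(p[1]);
        if (v > m)
            m = v;
    }
    return m;
}
```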
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define TEMP $r16 +#define t1 $f14 +#define t2 $f18 +#define t3 $f15 +#define t4 $f17 +#define s1 $f22 +#define s2 $f9 +#define s3 $f10 +#define s4 $f11 +#define a0 $f20 +#define a1 $f21 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VT0 $xr13 +#define VT1 $xr14 +#define res0 $xr18 +#define neg1 $xr19 +#define VX0 $xr20 +#define VX1 $xr21 +#define VM0 $xr22 +#define VM1 $xr23 + + PROLOGUE + MTC s1, $r0 + xvxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + fld.s a0, X, 0 * SIZE + fld.s a1, X, 1 * SIZE + fabs.s a0, a0 + fabs.s a1, a1 + fadd.s s1, a1, a0 + xvreplve0.w VM0, VM0 + li.d TEMP, 1 + li.w I, -1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + xvreplgr2vr.w neg1, I + xvffint.s.w neg1, neg1 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L23 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvld VX1, X, 8 * SIZE + addi.d I, I, -1 + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s x3, neg1, x1 + xvfmul.s x4, neg1, x2 + xvfcmp.clt.s VT0, x1, res0 + xvfcmp.clt.s VT1, x2, res0 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VT1 + addi.d X, X, 16 * SIZE + xvfadd.s VM1, x1, x2 + xvfmin.s VM0, VM0, VM1 + blt $r0, I, .L10 + .align 3 + +.L11: + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfmin.s VM1, x1, x2 + xvfmin.s VM0, x3, x4 + xvfmin.s VM0, VM0, VM1 + b .L23 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L23 + .align 3 + +.L21: + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + addi.d I, I, -1 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s3, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s4, t1, t3 + blt $r0, I, .L21 + .align 3 + +.L22: + fmin.s s1, s1, s2 + fmin.s s3, s3, s4 + fmin.s s1, s1, s3 + .align 3 + +.L23: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + addi.d I, I, -1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 + add.d X, X, INCX + fmin.s s1, a0, s1 + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/camin_lsx.S b/kernel/loongarch64/camin_lsx.S new file mode 100644 index 000000000..e9ad6b04d --- /dev/null +++ b/kernel/loongarch64/camin_lsx.S @@ -0,0 +1,211 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $f14 +#define t2 $f18 +#define t3 $f15 +#define t4 $f17 +#define s1 $f22 +#define s2 $f9 +#define s3 $f10 +#define s4 $f11 +#define TEMP $r16 +#define a0 $f20 +#define a1 $f21 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VT0 $vr13 +#define VT1 $vr14 +#define res0 $vr18 +#define neg1 $vr19 +#define VX0 $vr20 +#define VX1 $vr21 +#define VM0 $vr22 +#define VM1 $vr23 + + PROLOGUE + MTC s1, $r0 + vxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + fld.s a0, X, 0 * SIZE + fld.s a1, X, 1 * SIZE + fabs.s a0, a0 + fabs.s a1, a1 + fadd.s s1, a1, a0 + vreplvei.w VM0, VM0, 0 + li.d TEMP, 1 + li.w I, -1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + vreplgr2vr.w neg1, I + vffint.s.w neg1, neg1 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L23 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + addi.d I, I, -1 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, neg1, x1 + vfmul.s x4, neg1, x2 + vfcmp.clt.s VT0, x1, res0 + vfcmp.clt.s VT1, x2, res0 + vld VX0, X, 8 * SIZE + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT1 + vld VX1, X, 12 * SIZE + vfadd.s VM1, x1, x2 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, neg1, x1 + vfmul.s x4, neg1, x2 + vfcmp.clt.s VT0, x1, res0 + vfcmp.clt.s VT1, x2, res0 + addi.d X, X, 16 * SIZE + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT1 + vfadd.s x1, x1, x2 + vfmin.s VM1, x1, VM1 + vfmin.s VM0, VM0, VM1 + blt $r0, I, .L10 + .align 3 + +.L11: + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmin.s VM1, x1, x2 + vfmin.s VM0, x3, x4 + vfmin.s VM0, VM0, VM1 + b .L23 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L23 + .align 3 + +.L21: + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s 
t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + addi.d I, I, -1 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s3, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s4, t1, t3 + blt $r0, I, .L21 + .align 3 + +.L22: + fmin.s s1, s1, s2 + fmin.s s3, s3, s4 + fmin.s s1, s1, s3 + .align 3 + +.L23: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + fld.s a0, X, 0 * SIZE + fld.s a1, X, 1 * SIZE + addi.d I, I, -1 + fabs.s a0, a0 + fabs.s a1, a1 + fadd.s a0, a0, a1 + add.d X, X, INCX + fmin.s s1, a0, s1 + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/casum_lasx.S b/kernel/loongarch64/casum_lasx.S new file mode 100644 index 000000000..caf0ff969 --- /dev/null +++ b/kernel/loongarch64/casum_lasx.S @@ -0,0 +1,329 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 +#define res1 $xr16 +#define res2 $xr17 +#define res3 $xr18 +#define res0 $xr19 +#define neg1 $xr20 +#define VT0 $xr21 +#define VT1 $xr22 + + PROLOGUE + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + xvxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 +#ifdef DOUBLE + li.d t1, -1 + xvreplgr2vr.d neg1, t1 + xvffint.d.l neg1, neg1 +#else + li.w t1, -1 + xvreplgr2vr.w neg1, t1 + xvffint.s.w neg1, neg1 +#endif + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfmul.d VX2, neg1, VX0 + xvfmul.d VX3, neg1, VX1 + xvfcmp.clt.d VT0, VX0, res0 + xvfcmp.clt.d VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 + xvld VX2, X, 8 * SIZE + xvld VX3, X, 12 * SIZE + xvfmul.d VX0, neg1, VX2 + xvfmul.d VX1, neg1, VX3 + xvfcmp.clt.d VT0, VX2, res0 + xvfcmp.clt.d VT1, VX3, res0 + xvbitsel.v VX2, VX2, VX0, VT0 + xvbitsel.v VX3, VX3, VX1, VT1 + xvfadd.d res2, VX2, VX3 + xvfadd.d res1, res1, res2 +#else + xvld VX0, X, 0 * SIZE + xvld VX1, X, 8 * SIZE + xvfmul.s VX2, neg1, VX0 + xvfmul.s VX3, neg1, VX1 + xvfcmp.clt.s VT0, VX0, res0 + xvfcmp.clt.s VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.s res2, VX0, VX1 + xvfadd.s res1, res2, res1 +#endif + addi.d X, X, 16 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS a1, a1 + FABS a2, a2 + addi.d I, I, -1 + ADD a1, a1, a2 + ADD s1, a1, s1 + addi.d X, X, 2 * SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmul.d VX2, neg1, VX0 + xvfmul.d VX3, neg1, VX1 + xvfcmp.clt.d VT0, VX0, res0 + xvfcmp.clt.d VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.d res2, VX0, 
VX1 + xvfadd.d res1, res1, res2 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmul.d VX2, neg1, VX0 + xvfmul.d VX3, neg1, VX1 + xvfcmp.clt.d VT0, VX0, res0 + xvfcmp.clt.d VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 0 + xvinsgr2vr.w VX1, t2, 1 + xvinsgr2vr.w VX1, t3, 2 + xvinsgr2vr.w VX1, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 4 + xvinsgr2vr.w VX1, t2, 5 + xvinsgr2vr.w VX1, t3, 6 + xvinsgr2vr.w VX1, t4, 7 + xvfmul.s VX2, neg1, VX0 + xvfmul.s VX3, neg1, VX1 + xvfcmp.clt.s VT0, VX0, res0 + xvfcmp.clt.s VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.s res2, VX0, VX1 + xvfadd.s res1, res2, res1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS a1, a1 + FABS a2, a2 + addi.d I, I, -1 + ADD a1, a1, a2 + ADD s1, a1, s1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + MOV $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/casum_lsx.S b/kernel/loongarch64/casum_lsx.S new file mode 100644 index 000000000..4822f2080 --- /dev/null +++ b/kernel/loongarch64/casum_lsx.S @@ -0,0 +1,358 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 +#define res1 $vr16 +#define res2 $vr17 +#define res3 $vr18 +#define res0 $vr19 +#define neg1 $vr20 +#define VT0 $vr21 +#define VT1 $vr22 + + PROLOGUE + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + vxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 +#ifdef DOUBLE + li.d t1, -1 + vreplgr2vr.d neg1, t1 + vffint.d.l neg1, neg1 +#else + li.w t1, -1 + vreplgr2vr.w neg1, t1 + vffint.s.w neg1, neg1 +#endif + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + vld VX2, X, 4 * SIZE + vld VX3, X, 6 * SIZE + vfmul.d VX0, neg1, VX2 + vfmul.d VX1, neg1, VX3 + vfcmp.clt.d VT0, VX2, res0 + vfcmp.clt.d VT1, VX3, res0 + vbitsel.v VX2, VX2, VX0, VT0 + vbitsel.v VX3, VX3, VX1, VT1 + vfadd.d res2, VX2, VX3 + vfadd.d res1, res1, res2 + vld VX0, X, 8 * SIZE + vld VX1, X, 10 * SIZE + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + vld VX2, X, 12 * SIZE + vld VX3, X, 14 * SIZE + vfmul.d VX0, neg1, VX2 + vfmul.d VX1, neg1, VX3 + vfcmp.clt.d VT0, VX2, res0 + vfcmp.clt.d VT1, VX3, res0 + vbitsel.v VX2, VX2, VX0, VT0 + vbitsel.v VX3, VX3, VX1, VT1 + vfadd.d res2, VX2, VX3 + vfadd.d res1, res1, res2 + addi.d I, I, -1 +#else + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vfmul.s VX2, neg1, VX0 + vfmul.s VX3, neg1, VX1 + vfcmp.clt.s VT0, VX0, res0 + vfcmp.clt.s VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.s res2, VX0, VX1 + vld VX0, 
X, 8 * SIZE + vld VX1, X, 12 * SIZE + addi.d I, I, -1 + vfmul.s VX2, neg1, VX0 + vfmul.s VX3, neg1, VX1 + vfcmp.clt.s VT0, VX0, res0 + vfcmp.clt.s VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.s res3, VX1, VX0 + vfadd.s res2, res3, res2 + vfadd.s res1, res1, res2 +#endif + addi.d X, X, 16 * SIZE + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS a1, a1 + FABS a2, a2 + addi.d I, I, -1 + ADD a1, a1, a2 + ADD s1, a1, s1 + addi.d X, X, 2 * SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfmul.s VX2, neg1, VX0 + vfmul.s VX3, neg1, VX1 + vfcmp.clt.s VT0, VX0, res0 + vfcmp.clt.s VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.s res2, VX0, VX1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w 
VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vfmul.s VX0, neg1, VX2 + vfmul.s VX1, neg1, VX3 + vfcmp.clt.s VT0, VX2, res0 + vfcmp.clt.s VT1, VX3, res0 + vbitsel.v VX2, VX2, VX0, VT0 + vbitsel.v VX3, VX3, VX1, VT1 + vfadd.s res3, VX2, VX3 + vfadd.s res2, res3, res2 + vfadd.s res1, res1, res2 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS a1, a1 + FABS a2, a2 + addi.d I, I, -1 + ADD a1, a1, a2 + ADD s1, a1, s1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + MOV $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/caxpy_lasx.S b/kernel/loongarch64/caxpy_lasx.S new file mode 100644 index 000000000..2b970fe70 --- /dev/null +++ b/kernel/loongarch64/caxpy_lasx.S @@ -0,0 +1,707 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define XX $r5 +#define YY $r6 +#define ALPHAR $f0 +#define ALPHAI $f1 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VXAR $xr23 +#define VXAI $xr19 +#define x1 $xr18 +#define x2 $xr17 +#define x3 $xr16 +#define x4 $xr15 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + CMPEQ $fcc0, ALPHAR, a1 + CMPEQ $fcc1, ALPHAI, a1 + bceqz $fcc0, .L10 + bcnez $fcc1, .L999 +.L10: + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + MTG t1, ALPHAR + MTG t2, ALPHAI +#ifdef DOUBLE + xvreplgr2vr.d VXAR, t1 + xvreplgr2vr.d VXAI, t2 + srai.d I, N, 2 +#else + xvreplgr2vr.w VXAR, t1 + xvreplgr2vr.w VXAI, t2 + srai.d I, N, 3 +#endif + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + .align 3 + +.L111: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 +#else + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX1, X, 8 * SIZE + xvld VX3, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmsub.d VX1, VXAR, x1, VX0 + xvfmadd.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfadd.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmsub.s VX1, VXAR, x1, VX0 + xvfmadd.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmadd.d VX1, VXAR, x1, VX0 + xvfmsub.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfsub.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmadd.s VX1, VXAR, x1, VX0 + xvfmsub.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + .align 3 + +.L121: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 2 + xvinsgr2vr.d x4, t4, 2 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.d x3, t1, 1 + xvinsgr2vr.d x4, t2, 1 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + add.d 
Y, Y, INCY + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 +#else + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + xvld VX1, X, 8 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + add.d Y, Y, INCY + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmsub.d VX1, VXAR, x1, VX0 + xvfmadd.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfadd.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmsub.s VX1, VXAR, x1, VX0 + xvfmadd.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmadd.d VX1, VXAR, x1, VX0 + xvfmsub.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfsub.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmadd.s VX1, VXAR, x1, VX0 + xvfmsub.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + xvstelm.d x3, YY, 0 * SIZE, 0 + xvstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 2 + xvstelm.d x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 1 + xvstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 3 + xvstelm.d x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 +#else + addi.d I, I, -1 + xvstelm.w x3, YY, 0 * SIZE, 0 + xvstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 1 + xvstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 4 + xvstelm.w x4, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 5 + xvstelm.w x4, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 2 + xvstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 3 + xvstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 6 + xvstelm.w x4, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 7 + xvstelm.w x4, YY, 1 * SIZE, 7 + add.d YY, YY, INCY + addi.d X, X, 16 * SIZE +#endif + blt $r0, I, .L121 + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + .align 3 + +.L211: +#ifdef DOUBLE + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 2 + xvinsgr2vr.d x2, t4, 2 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 1 + xvinsgr2vr.d x2, t2, 1 + xvinsgr2vr.d x1, t3, 3 + 
xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 +#else + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + xvld VX3, Y, 8 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmsub.d VX1, VXAR, x1, VX0 + xvfmadd.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfadd.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmsub.s VX1, VXAR, x1, VX0 + xvfmadd.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmadd.d VX1, VXAR, x1, VX0 + xvfmsub.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfsub.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmadd.s VX1, VXAR, x1, VX0 + xvfmsub.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 8 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + blt $r0, I, .L211 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + .align 3 + +.L222: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 1 + xvinsgr2vr.d x4, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 2 + xvinsgr2vr.d x4, t2, 2 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d 
X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmsub.d VX1, VXAR, x1, VX0 + xvfmadd.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfadd.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmsub.s VX1, VXAR, x1, VX0 + xvfmadd.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmadd.d VX1, VXAR, x1, VX0 + xvfmsub.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfsub.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmadd.s VX1, VXAR, x1, VX0 + xvfmsub.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfsub.s x4, x4, VX3 +#endif +#endif + addi.d I, I, -1 +#ifdef DOUBLE + xvstelm.d x3, YY, 0 * SIZE, 0 + xvstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 1 + xvstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 2 + xvstelm.d x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 3 + xvstelm.d x4, YY, 1 * SIZE, 3 +#else + xvstelm.w x3, YY, 0 * SIZE, 0 + xvstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 1 + xvstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 2 + xvstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 3 + xvstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 4 + xvstelm.w x4, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 5 + xvstelm.w x4, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 6 + xvstelm.w x4, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 7 + xvstelm.w x4, YY, 1 * SIZE, 7 +#endif + add.d YY, YY, INCY + blt $r0, I, .L222 + .align 3 + +.L997: +#ifdef DOUBLE + andi I, N, 3 +#else + andi I, N, 7 +#endif + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + addi.d I, I, -1 +#if !defined(CONJ) + MUL s1, 
ALPHAI, a2 + MUL s2, ALPHAI, a1 + MSUB s3, ALPHAR, a1, s1 + MADD s4, a2, ALPHAR, s2 + ADD s3, s3, a3 + ADD s4, s4, a4 +#else + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MADD s3, ALPHAR, a1, s1 + MSUB s4, a2, ALPHAR, s2 + ADD s3, s3, a3 + SUB s4, a4, s4 +#endif + ST s3, Y, 0 * SIZE + ST s4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/caxpy_lsx.S b/kernel/loongarch64/caxpy_lsx.S new file mode 100644 index 000000000..85598d0b9 --- /dev/null +++ b/kernel/loongarch64/caxpy_lsx.S @@ -0,0 +1,679 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define XX $r5 +#define YY $r6 +#define ALPHAR $f0 +#define ALPHAI $f1 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VXAR $vr23 +#define VXAI $vr19 +#define x1 $vr18 +#define x2 $vr17 +#define x3 $vr16 +#define x4 $vr15 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + CMPEQ $fcc0, ALPHAR, a1 + CMPEQ $fcc1, ALPHAI, a1 + bceqz $fcc0, .L10 + bcnez $fcc1, .L999 +.L10: + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + MTG t1, ALPHAR + MTG t2, ALPHAI +#ifdef DOUBLE + vreplgr2vr.d VXAR, t1 + vreplgr2vr.d VXAI, t2 +#else + vreplgr2vr.w VXAR, t1 + vreplgr2vr.w VXAI, t2 +#endif + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + .align 3 + +.L111: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 +#else + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmsub.s VX1, VXAR, x1, VX0 + vfmadd.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmadd.s VX1, VXAR, x1, VX0 + vfmsub.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 +#if !defined(CONJ) + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#endif + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE +#else + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + .align 3 + +.L121: +#ifdef 
DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 +#else + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmsub.s VX1, VXAR, x1, VX0 + vfmadd.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmadd.s VX1, VXAR, x1, VX0 + vfmsub.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 +#if !defined(CONJ) + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#endif + addi.d I, I, -1 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 +#else + addi.d I, I, -1 + vstelm.w x3, YY, 0 * SIZE, 0 + vstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 1 + vstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 2 + vstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 3 + vstelm.w x4, YY, 1 * SIZE, 3 +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + .align 3 + +.L211: +#ifdef DOUBLE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * 
SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmsub.s VX1, VXAR, x1, VX0 + vfmadd.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmadd.s VX1, VXAR, x1, VX0 + vfmsub.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + + vld VX2, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 +#if !defined(CONJ) + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#endif + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + addi.d I, I, -1 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE +#else + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + addi.d I, I, -1 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + .align 3 + +.L222: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + 
vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 +#endif + add.d Y, Y, INCY +#if !defined(CONJ) +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmsub.s VX1, VXAR, x1, VX0 + vfmadd.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmadd.s VX1, VXAR, x1, VX0 + vfmsub.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY +#if !defined(CONJ) + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#endif + addi.d I, I, -1 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 +#else + addi.d I, I, -1 + vstelm.w x3, YY, 0 * SIZE, 0 + vstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 1 + vstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 2 + vstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 3 + vstelm.w x4, YY, 1 * SIZE, 3 +#endif + add.d YY, YY, INCY + blt $r0, I, .L222 + .align 3 + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + addi.d I, I, -1 +#if !defined(CONJ) + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MSUB s3, ALPHAR, a1, s1 + MADD s4, a2, ALPHAR, s2 + ADD s3, s3, a3 + ADD s4, s4, a4 +#else + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MADD s3, ALPHAR, a1, s1 + MSUB s4, a2, ALPHAR, s2 + ADD s3, s3, a3 + SUB s4, a4, s4 +#endif + ST s3, Y, 0 * SIZE + ST s4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/ccopy_lasx.S b/kernel/loongarch64/ccopy_lasx.S new file mode 100644 index 000000000..fbc5d96bc --- /dev/null +++ b/kernel/loongarch64/ccopy_lasx.S @@ -0,0 +1,386 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX2, X, 8 * SIZE + xvld VX3, X, 12 * SIZE + xvst VX0, Y, 0 * SIZE + xvst VX1, Y, 4 * SIZE + xvst VX2, Y, 8 * SIZE + xvst VX3, Y, 12 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvst VX0, Y, 0 * SIZE + xvst VX1, Y, 8 * SIZE +#endif + addi.d I, I, -1 + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX2, X, 8 * SIZE + xvld VX3, X, 12 * SIZE + xvstelm.d VX0, Y, 0 * SIZE, 0 + xvstelm.d VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + xvstelm.d VX0, Y, 0 * SIZE, 2 + xvstelm.d VX0, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0 * SIZE, 0 + xvstelm.d VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0 * SIZE, 2 + xvstelm.d VX1, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + xvstelm.d VX2, Y, 0 * SIZE, 0 + xvstelm.d VX2, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + xvstelm.d VX2, Y, 0 * SIZE, 2 + xvstelm.d VX2, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + xvstelm.d VX3, Y, 0 * SIZE, 0 + xvstelm.d VX3, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + xvstelm.d 
VX3, Y, 0 * SIZE, 2 + xvstelm.d VX3, Y, 1 * SIZE, 3 +#else + xvld VX1, X, 8 * SIZE + xvstelm.w VX0, Y, 0 * SIZE, 0 + xvstelm.w VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0 * SIZE, 2 + xvstelm.w VX0, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0 * SIZE, 4 + xvstelm.w VX0, Y, 1 * SIZE, 5 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0 * SIZE, 6 + xvstelm.w VX0, Y, 1 * SIZE, 7 + add.d Y, Y, INCY + xvstelm.w VX1, Y, 0 * SIZE, 0 + xvstelm.w VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + xvstelm.w VX1, Y, 0 * SIZE, 2 + xvstelm.w VX1, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + xvstelm.w VX1, Y, 0 * SIZE, 4 + xvstelm.w VX1, Y, 1 * SIZE, 5 + add.d Y, Y, INCY + xvstelm.w VX1, Y, 0 * SIZE, 6 + xvstelm.w VX1, Y, 1 * SIZE, 7 +#endif + add.d Y, Y, INCY + addi.d X, X, 16 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + xvst VX0, Y, 0 * SIZE + xvst VX1, Y, 4 * SIZE + xvst VX2, Y, 8 * SIZE + xvst VX3, Y, 12 * SIZE +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 0 + xvinsgr2vr.w VX1, t2, 1 + xvinsgr2vr.w VX1, t3, 2 + xvinsgr2vr.w VX1, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 4 + xvinsgr2vr.w VX1, t2, 5 + xvinsgr2vr.w VX1, t3, 6 + xvinsgr2vr.w VX1, t4, 7 + xvst VX0, Y, 0 * SIZE + xvst VX1, Y, 8 * SIZE +#endif + addi.d I, I, -1 + addi.d Y, Y, 16 * SIZE + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d X, X, INCX + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + +.L222: 
+ LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/ccopy_lsx.S b/kernel/loongarch64/ccopy_lsx.S new file mode 100644 index 000000000..4c4d880f1 --- /dev/null +++ b/kernel/loongarch64/ccopy_lsx.S @@ -0,0 +1,411 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11:// INCX==1 and INCY==1 + bge $r0, I, .L112 + .align 3 + +.L111: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX2, X, 4 * SIZE + vld VX3, X, 6 * SIZE + vst VX0, Y, 0 * SIZE + vst VX1, Y, 2 * SIZE + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + vld VX0, X, 8 * SIZE + vld VX1, X, 10 * SIZE + vld VX2, X, 12 * SIZE + vld VX3, X, 14 * SIZE + addi.d I, I, -1 + vst VX0, Y, 8 * SIZE + vst VX1, Y, 10 * SIZE + vst VX2, Y, 12 * SIZE + vst VX3, Y, 14 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX2, X, 8 * SIZE + vld VX3, X, 12 * SIZE + addi.d I, I, -1 + vst VX0, Y, 0 * SIZE + vst VX1, Y, 4 * SIZE + vst VX2, Y, 8 * SIZE + vst VX3, Y, 12 * SIZE +#endif + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + .align 3 + +.L121: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX2, X, 4 * SIZE + vld VX3, X, 6 * SIZE + vstelm.d VX0, Y, 0 * SIZE, 0 + vstelm.d VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0 * SIZE, 0 + vstelm.d VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX2, Y, 0 * SIZE, 0 + vstelm.d VX2, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX3, Y, 0 * SIZE, 0 + vstelm.d VX3, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vld VX0, X, 8 * SIZE + vld VX1, X, 10 * SIZE + vld VX2, X, 12 * SIZE + vld VX3, X, 14 * SIZE + vstelm.d VX0, Y, 0 * SIZE, 0 + vstelm.d VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0 * SIZE, 0 + vstelm.d VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX2, Y, 0 * SIZE, 0 + vstelm.d VX2, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX3, Y, 0 * SIZE, 0 + vstelm.d VX3, Y, 1 * SIZE, 1 +#else + vld VX1, X, 4 * SIZE + vld VX2, X, 8 * SIZE + vld VX3, X, 12 * SIZE + vstelm.w VX0, Y, 0 * SIZE, 0 + vstelm.w VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.w VX0, Y, 0 * SIZE, 2 + vstelm.w VX0, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0 * SIZE, 0 + vstelm.w VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0 * SIZE, 2 + vstelm.w VX1, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + vstelm.w VX2, Y, 0 * SIZE, 0 + vstelm.w VX2, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.w VX2, Y, 0 * SIZE, 2 + vstelm.w VX2, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + vstelm.w VX3, Y, 0 * SIZE, 0 + vstelm.w VX3, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.w VX3, Y, 0 * SIZE, 2 + vstelm.w VX3, Y, 1 * SIZE, 3 +#endif + add.d Y, Y, INCY + addi.d X, X, 16 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + 
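In the INCX == 1, INCY != 1 path the .L121 loop above loads X contiguously with vld and scatters each (real, imaginary) pair to the strided Y with vstelm, eight complex elements per pass; the .L122/.L123 code that follows handles the N % 8 tail with scalar LD/ST. A rough C equivalent, assuming single precision and INCY counted in complex elements (a sketch, not the kernel's exact code path):

    static void ccopy_inc1_to_strided(long n, const float *x,
                                      float *y, long incy) {
        /* the kernel does the first n & ~7 elements with vld/vstelm,
           then the same per-element copy for the n & 7 tail */
        for (long i = 0; i < n; i++) {
            y[0] = x[2 * i];         /* real */
            y[1] = x[2 * i + 1];     /* imaginary */
            y += 2 * incy;           /* Y stride is in complex elements */
        }
    }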
bge $r0, I, .L999 + .align 3 + +.L123: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 0 * SIZE + vst VX1, Y, 2 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 4 * SIZE + vst VX1, Y, 6 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 8 * SIZE + vst VX1, Y, 10 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 12 * SIZE + vst VX1, Y, 14 * SIZE +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vst VX0, Y, 0 * SIZE + vst VX1, Y, 4 * SIZE + vst VX2, Y, 8 * SIZE + vst VX3, Y, 12 * SIZE +#endif + addi.d Y, Y, 16 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d X, X, INCX + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + +.L222: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * 
SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/cdot_lasx.S b/kernel/loongarch64/cdot_lasx.S new file mode 100644 index 000000000..0583e56ea --- /dev/null +++ b/kernel/loongarch64/cdot_lasx.S @@ -0,0 +1,565 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r19 +#define TEMP $r10 +#define t1 $r11 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define res1 $xr16 +#define res2 $xr17 +#define res3 $xr18 +#define res4 $xr19 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 +#define x1 $xr20 +#define x2 $xr21 +#define x3 $xr22 +#define x4 $xr23 + + PROLOGUE + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + xvxor.v res3, res3, res3 + xvxor.v res4, res4, res4 + bge $r0, N, .L999 + li.d TEMP, 2 * SIZE + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT +#ifdef DOUBLE + srai.d I, N, 2 +#else + srai.d I, N, 3 +#endif + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + .align 3 + +.L111: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 + xvfmadd.d res1, x1, x3, res1 + xvfmadd.d res2, x2, x3, res2 + xvfmadd.d res3, x1, x4, res3 + xvfmadd.d res4, x2, x4, res4 + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 + xvfmadd.s res1, x1, x3, res1 + xvfmadd.s res2, x2, x3, res2 + xvfmadd.s res3, x1, x4, res3 + xvfmadd.s res4, x2, x4, res4 + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L996 + .align 3 + +.L12: + bge $r0, I, .L997 + .align 3 + +.L121: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 2 + xvinsgr2vr.d x4, t4, 2 + xvld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 1 + xvinsgr2vr.d x4, t2, 1 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + addi.d X, X, 8 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmadd.d res1, x1, x3, res1 + xvfmadd.d res2, x2, x3, res2 + xvfmadd.d res3, x1, x4, res3 + xvfmadd.d res4, x2, x4, res4 +#else + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + xvld VX1, X, 8 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + 
add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + addi.d X, X, 16 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmadd.s res1, x1, x3, res1 + xvfmadd.s res2, x2, x3, res2 + xvfmadd.s res3, x1, x4, res3 + xvfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L121 + b .L996 + .align 3 + +.L21: + bge $r0, I, .L997 + .align 3 + +.L211: + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 2 + xvinsgr2vr.d x2, t4, 2 + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 1 + xvinsgr2vr.d x2, t2, 1 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + addi.d Y, Y, 8 * SIZE + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 + xvfmadd.d res1, x1, x3, res1 + xvfmadd.d res2, x2, x3, res2 + xvfmadd.d res3, x1, x4, res3 + xvfmadd.d res4, x2, x4, res4 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + xvld VX3, Y, 8 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + addi.d Y, Y, 8 * SIZE + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 + xvfmadd.s res1, x1, x3, res1 + xvfmadd.s res2, x2, x3, res2 + xvfmadd.s res3, x1, x4, res3 + xvfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L211 + b .L996 + .align 3 + +.L22: + bge $r0, I, .L997 + .align 3 + +.L222: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 1 + xvinsgr2vr.d x4, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 2 + xvinsgr2vr.d x4, t2, 2 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + xvfmadd.d res1, x1, x3, res1 + xvfmadd.d res2, x2, x3, res2 + xvfmadd.d res3, x1, x4, res3 + xvfmadd.d res4, x2, x4, res4 
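Across all four load paths the accumulators keep the same meaning: x1/x3 hold the real parts and x2/x4 the imaginary parts of X and Y, so res1 collects xr*yr, res2 collects xi*yr, res3 collects xr*yi and res4 collects xi*yi. The .L996 block folds the vector lanes of each accumulator and .L999 forms the final complex value according to CONJ. A scalar C sketch of the same arithmetic, assuming double precision and strides counted in complex elements (illustrative names only):

    static void cdot_ref(long n, const double *x, long incx,
                         const double *y, long incy, int conj,
                         double *re, double *im) {
        double s1 = 0, s2 = 0, s3 = 0, s4 = 0;   /* the four accumulators */
        for (long i = 0; i < n; i++) {
            double xr = x[0], xi = x[1];
            double yr = y[0], yi = y[1];
            s1 += xr * yr;
            s2 += xi * yr;
            s3 += xr * yi;
            s4 += xi * yi;
            x += 2 * incx;
            y += 2 * incy;
        }
        if (!conj) { *re = s1 - s4; *im = s3 + s2; }  /* dotu: x . y        */
        else       { *re = s1 + s4; *im = s3 - s2; }  /* dotc: conj(x) . y  */
    }

The CONJ build of this file therefore serves the CDOTC/ZDOTC entry points, the non-CONJ build CDOTU/ZDOTU.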
+#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + xvfmadd.s res1, x1, x3, res1 + xvfmadd.s res2, x2, x3, res2 + xvfmadd.s res3, x1, x4, res3 + xvfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L996: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 + xvpickve.d VX1, res2, 1 + xvpickve.d VX2, res2, 2 + xvpickve.d VX3, res2, 3 + xvfadd.d res2, VX1, res2 + xvfadd.d res2, VX2, res2 + xvfadd.d res2, VX3, res2 + xvpickve.d VX1, res3, 1 + xvpickve.d VX2, res3, 2 + xvpickve.d VX3, res3, 3 + xvfadd.d res3, VX1, res3 + xvfadd.d res3, VX2, res3 + xvfadd.d res3, VX3, res3 + xvpickve.d VX1, res4, 1 + xvpickve.d VX2, res4, 2 + xvpickve.d VX3, res4, 3 + xvfadd.d res4, VX1, res4 + xvfadd.d res4, VX2, res4 + xvfadd.d res4, VX3, res4 +#else + xvpickve.w VX0, res1, 1 + xvpickve.w VX1, res1, 2 + xvpickve.w VX2, res1, 3 + xvpickve.w VX3, res1, 4 + xvpickve.w x1, res1, 5 + xvpickve.w x2, res1, 6 + xvpickve.w x3, res1, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvfadd.s res1, x1, res1 + xvfadd.s res1, x2, res1 + xvfadd.s res1, x3, res1 + xvpickve.w VX0, res2, 1 + xvpickve.w VX1, res2, 2 + xvpickve.w VX2, res2, 3 + xvpickve.w VX3, res2, 4 + xvpickve.w x1, res2, 5 + xvpickve.w x2, res2, 6 + xvpickve.w x3, res2, 7 + xvfadd.s res2, VX0, res2 + xvfadd.s res2, VX1, res2 + xvfadd.s res2, VX2, res2 + xvfadd.s res2, VX3, res2 + xvfadd.s res2, x1, res2 + xvfadd.s res2, x2, res2 + xvfadd.s res2, x3, res2 + xvpickve.w VX0, res3, 1 + xvpickve.w VX1, res3, 2 + xvpickve.w VX2, res3, 3 + xvpickve.w VX3, res3, 4 + xvpickve.w x1, res3, 5 
+ xvpickve.w x2, res3, 6 + xvpickve.w x3, res3, 7 + xvfadd.s res3, VX0, res3 + xvfadd.s res3, VX1, res3 + xvfadd.s res3, VX2, res3 + xvfadd.s res3, VX3, res3 + xvfadd.s res3, x1, res3 + xvfadd.s res3, x2, res3 + xvfadd.s res3, x3, res3 + xvpickve.w VX0, res4, 1 + xvpickve.w VX1, res4, 2 + xvpickve.w VX2, res4, 3 + xvpickve.w VX3, res4, 4 + xvpickve.w x1, res4, 5 + xvpickve.w x2, res4, 6 + xvpickve.w x3, res4, 7 + xvfadd.s res4, VX0, res4 + xvfadd.s res4, VX1, res4 + xvfadd.s res4, VX2, res4 + xvfadd.s res4, VX3, res4 + xvfadd.s res4, x1, res4 + xvfadd.s res4, x2, res4 + xvfadd.s res4, x3, res4 +#endif + .align 3 + +.L997: +#ifdef DOUBLE + andi I, N, 3 +#else + andi I, N, 7 +#endif + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + MADD s1, a1, a3, s1 + MADD s2, a2, a3, s2 + MADD s3, a1, a4, s3 + MADD s4, a2, a4, s4 + addi.d I, I, -1 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: +#ifndef CONJ + SUB $f0, s1, s4 + ADD $f1, s3, s2 +#else + ADD $f0, s1, s4 + SUB $f1, s3, s2 +#endif + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/cdot_lsx.S b/kernel/loongarch64/cdot_lsx.S new file mode 100644 index 000000000..5feea12be --- /dev/null +++ b/kernel/loongarch64/cdot_lsx.S @@ -0,0 +1,397 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r19 +#define TEMP $r10 +#define t1 $r11 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define res1 $vr16 +#define res2 $vr17 +#define res3 $vr18 +#define res4 $vr19 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 +#define x1 $vr20 +#define x2 $vr21 +#define x3 $vr22 +#define x4 $vr23 + + PROLOGUE + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + vxor.v res3, res3, res3 + vxor.v res4, res4, res4 + bge $r0, N, .L999 + li.d TEMP, 2 * SIZE + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT +#ifdef DOUBLE + srai.d I, N, 1 +#else + srai.d I, N, 2 +#endif + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + .align 3 + +.L111: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmadd.d res1, x1, x3, res1 + vfmadd.d res2, x2, x3, res2 + vfmadd.d res3, x1, x4, res3 + vfmadd.d res4, x2, x4, res4 + addi.d X, X, 4 * SIZE + addi.d Y, Y, 4 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmadd.s res1, x1, x3, res1 + vfmadd.s res2, x2, x3, res2 + vfmadd.s res3, x1, x4, res3 + vfmadd.s res4, x2, x4, res4 + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L996 + .align 3 + +.L12: + bge $r0, I, .L997 + .align 3 + +.L121: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + addi.d X, X, 4 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmadd.d res1, x1, x3, res1 + vfmadd.d res2, x2, x3, res2 + vfmadd.d res3, x1, x4, res3 + vfmadd.d res4, x2, x4, res4 +#else + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + addi.d X, X, 8 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmadd.s res1, x1, x3, res1 + vfmadd.s res2, x2, x3, res2 + vfmadd.s res3, x1, x4, res3 + vfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L121 + b .L996 + .align 3 + +.L21: + bge $r0, I, .L997 + .align 3 + +.L211: + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX3, Y, 2 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + 
vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + addi.d Y, Y, 4 * SIZE + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmadd.d res1, x1, x3, res1 + vfmadd.d res2, x2, x3, res2 + vfmadd.d res3, x1, x4, res3 + vfmadd.d res4, x2, x4, res4 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + addi.d Y, Y, 8 * SIZE + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmadd.s res1, x1, x3, res1 + vfmadd.s res2, x2, x3, res2 + vfmadd.s res3, x1, x4, res3 + vfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L211 + b .L996 + .align 3 + +.L22: + bge $r0, I, .L997 + .align 3 + +.L222: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + vfmadd.d res1, x1, x3, res1 + vfmadd.d res2, x2, x3, res2 + vfmadd.d res3, x1, x4, res3 + vfmadd.d res4, x2, x4, res4 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + vfmadd.s res1, x1, x3, res1 + vfmadd.s res2, x2, x3, res2 + vfmadd.s res3, x1, x4, res3 + vfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L996: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 + vreplvei.d VX1, res2, 1 + vfadd.d res2, VX1, res2 + vreplvei.d VX1, res3, 1 + vfadd.d res3, VX1, res3 + vreplvei.d VX1, res4, 1 + vfadd.d res4, VX1, res4 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 + vreplvei.w VX1, res2, 1 + vreplvei.w VX2, res2, 2 + vreplvei.w VX3, res2, 3 + vfadd.s res2, VX1, res2 + vfadd.s res2, VX2, res2 + vfadd.s res2, VX3, res2 + vreplvei.w VX1, res3, 1 + vreplvei.w VX2, res3, 2 + vreplvei.w VX3, res3, 3 + vfadd.s res3, VX1, res3 + vfadd.s res3, VX2, res3 + vfadd.s res3, VX3, res3 + vreplvei.w VX1, res4, 1 + vreplvei.w VX2, res4, 2 + vreplvei.w VX3, res4, 3 + vfadd.s res4, VX1, res4 + vfadd.s res4, VX2, res4 + 
vfadd.s res4, VX3, res4 +#endif + .align 3 + +.L997: +#ifdef DOUBLE + andi I, N, 1 +#else + andi I, N, 3 +#endif + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + MADD s1, a1, a3, s1 + MADD s2, a2, a3, s2 + MADD s3, a1, a4, s3 + MADD s4, a2, a4, s4 + addi.d I, I, -1 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: +#ifndef CONJ + SUB $f0, s1, s4 + ADD $f1, s3, s2 +#else + ADD $f0, s1, s4 + SUB $f1, s3, s2 +#endif + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/cgemm_kernel_2x2_lasx.S b/kernel/loongarch64/cgemm_kernel_2x2_lasx.S new file mode 100644 index 000000000..e07f7dc64 --- /dev/null +++ b/kernel/loongarch64/cgemm_kernel_2x2_lasx.S @@ -0,0 +1,857 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + + +/* Function parameters */ +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define ALPHA_R $f0 // param 4: alphar +#define ALPHA_I $f1 // param 5: alphai +#define A $r7 // param 6: ba +#define B $r8 // param 7: bb +#define C $r9 // param 8: bc +#define LDC $r10 // param 9: ldc + +#if defined (TRMMKERNEL) +#define OFFSET $r11 // param 10: offset +#endif +#define OFF $r26 + +#define I $r12 +#define J $r13 +#define L $r14 +#define TL $r15 +#define A0 $r16 +#define B0 $r17 +#define C0 $r18 +#define C1 $r19 +#define C2 $r20 +#define C3 $r23 +#define T0 $r24 +#define T1 $r25 + +#define a1 $f2 +#define a2 $f3 +#define a3 $f4 +#define a4 $f5 +#define a5 $f6 +#define a6 $f7 +#define a7 $f8 +#define a8 $f9 +#define b1 $f10 +#define b2 $f11 +#define b3 $f12 +#define b4 $f13 +#define b5 $f14 +#define b6 $f15 +#define b7 $f16 +#define b8 $f17 +#define c11 $f18 +#define c12 $f19 +#define c21 $f20 +#define c22 $f21 +#define c31 $f22 +#define c32 $f23 +#define c41 $f24 +#define c42 $f25 + +/* LASX vectors */ +#define U0 $xr30 +#define U1 $xr31 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define U8 $xr8 +#define U9 $xr9 +#define U10 $xr10 +#define U11 $xr11 +#define U12 $xr12 +#define U13 $xr13 +#define U14 $xr14 +#define U15 $xr15 +#define D0 $xr16 +#define D1 $xr17 +#define D2 $xr18 +#define D3 $xr19 +#define D4 $xr20 +#define D5 $xr21 +#define D6 $xr22 +#define D7 $xr23 +#define D8 $xr24 +#define D9 $xr25 +#define D10 $xr26 +#define D11 $xr27 +#define VALPHAR $xr28 +#define VALPHAI $xr29 + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define XVMADD1 XVFMADD +#define XVMADD2 XVFMADD +#define XVMADD3 XVNMSUB +#define XVMADD4 XVFMADD + +#define VMADD1 VFMADD +#define VMADD2 VFMADD +#define VMADD3 VNMSUB +#define VMADD4 VFMADD + +#define XVFADD1 XVFADD +#define XVFADD2 XVFADD +#define XVFADD3 XVFSUB +#define XVFADD4 XVFADD + +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define XVMADD1 XVFMADD +#define XVMADD2 XVFMADD +#define XVMADD3 XVFMADD +#define XVMADD4 XVNMSUB + +#define VMADD1 VFMADD +#define VMADD2 VFMADD +#define VMADD3 VFMADD +#define VMADD4 VNMSUB + +#define XVFADD1 XVFADD +#define XVFADD2 XVFADD +#define XVFADD3 XVFADD +#define XVFADD4 XVFSUB + +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define XVMADD1 XVFMADD +#define XVMADD2 XVNMSUB +#define XVMADD3 XVFMADD +#define XVMADD4 XVFMADD + +#define VMADD1 VFMADD +#define VMADD2 VNMSUB +#define VMADD3 VFMADD +#define VMADD4 VFMADD + +#define XVFADD1 XVFADD +#define XVFADD2 XVFSUB +#define XVFADD3 XVFADD +#define XVFADD4 XVFADD + +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define XVMADD1 XVFMADD +#define XVMADD2 XVNMSUB +#define XVMADD3 XVNMSUB +#define XVMADD4 XVNMSUB + +#define VMADD1 VFMADD +#define VMADD2 VNMSUB +#define VMADD3 VNMSUB +#define VMADD4 VNMSUB + +#define XVFADD1 XVFADD +#define XVFADD2 XVFSUB +#define XVFADD3 XVFSUB +#define XVFADD4 XVFSUB + +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + addi.d $sp, $sp, 
-128 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + ST $f23, $sp, 40 + ST $f24, $sp, 48 + ST $f25, $sp, 56 + ST $f26, $sp, 64 + ST $f27, $sp, 72 + ST $f28, $sp, 80 + ST $f29, $sp, 88 + ST $f30, $sp, 96 + ST $f31, $sp, 104 + ST ALPHA_R,$sp, 112 + ST ALPHA_I,$sp, 120 + + xvldrepl.w VALPHAR, $sp, 112 + xvldrepl.w VALPHAI, $sp, 120 + +#if defined (TRMMKERNEL) && !defined(LEFT) + sub.d OFF, $r0, OFFSET +#else + xor OFF, OFF, OFF +#endif + + slli.d LDC, LDC, 2 + + move J, $r0 + srai.d T0, N, 1 + beq J, T0, .L19 + +.L10: /* for(j=0; j 0) I-- */ + move S1, TS //a_offset1 + add.d S2, TS, TL //a_offset2 + srai.d J, M, 0x02 + add.d TS, TS, T0 + + beq J, ZERO, .L_I3 + +.L_I1: /* if (j > 0) J-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x00 + xvld U2, S2, 0x00 + + xvpermi.q U0, U2, 0x02 + xvpermi.q U2, U1, 0x31 + + xvpermi.d U0, U0, 0xd8 + xvpermi.d U2, U2, 0xd8 + + xvst U0, TD, 0x00 + xvst U2, TD, 0x20 + + addi.d S1, S1, 0x20 // a_offset1 + addi.d S2, S2, 0x20 + addi.d TD, TD, 0x40 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_I1 + +.L_I3: + andi J, M, 0x03 + beq J, ZERO, .L_II20 + +.L_II1: /* j = (m & 3) if (j > 0) */ + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d TD, TD, 0x10 + + addi.d J, J, -1 + blt ZERO, J, .L_II1 + +.L_II20: + addi.d I, I, -1 + blt ZERO, I, .L_J1 + +.L_N0: /* if(n&1)*/ + andi I, N, 0x01 + beq ZERO, I, .L_N00 + +.L_N1: + srai.d J, M, 0x02 + beq ZERO, J, .L_N10 + +.L_N11: /* j = (m >> 2) if (j > 0) */ + xvld U0, TS, 0x00 + + xvst U0, TD, 0x00 + + addi.d TS, TS, 0x20 // a_offset + addi.d TD, TD, 0x20 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N11 + +.L_N10: + andi J, M, 0x03 + beq J, ZERO, .L_N00 + +.L_N12: /* j = (m & 3) if (j > 0) */ + fld.s F0, TS, 0x00 + fld.s F1, TS, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + + addi.d TS, TS, 0x08 // a_offset + addi.d TD, TD, 0x08 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N12 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_ncopy_2_lsx.S b/kernel/loongarch64/cgemm_ncopy_2_lsx.S new file mode 100644 index 000000000..1cf4d87dc --- /dev/null +++ b/kernel/loongarch64/cgemm_ncopy_2_lsx.S @@ -0,0 +1,202 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r11 +#define TL $r7 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define D0 $vr8 +#define D1 $vr9 +#define D2 $vr10 +#define D3 $vr11 +#define D4 $vr12 +#define D5 $vr13 +#define D6 $vr14 +#define D7 $vr15 +#define D8 $vr16 + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TD, DST //boffset + move TS, SRC //aoffset + + slli.d TL, LDA, 0x02 //lda + slli.d TL, TL, 0x01 + slli.d T0, TL, 0x01 + + srai.d I, N, 0x01 + beq I, ZERO, .L_N0 + +.L_J1: /* if (i > 0) I-- */ + move S1, TS //a_offset1 + add.d S2, TS, TL //a_offset2 + srai.d J, M, 0x02 + add.d TS, TS, T0 + + beq J, ZERO, .L_I3 + +.L_I1: /* if (j > 0) J-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vand.v D0, U2, U2 + vand.v D1, U3, U3 + vand.v D2, U2, U2 + vand.v D3, U3, U3 + + vpermi.w D0, U0, 0x44 + vpermi.w D2, U0, 0xee + vpermi.w D1, U1, 0x44 + vpermi.w D3, U1, 0xee + + vst D0, TD, 0x00 + vst D2, TD, 0x10 + vst D1, TD, 0x20 + vst D3, TD, 0x30 + + addi.d S1, S1, 0x20 // a_offset1 + addi.d S2, S2, 0x20 + addi.d TD, TD, 0x40 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_I1 + +.L_I3: + andi J, M, 0x03 + beq J, ZERO, .L_II20 + +.L_II1: /* j = (m & 3) if (j > 0) */ + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d TD, TD, 0x10 + + addi.d J, J, -1 + blt ZERO, J, .L_II1 + +.L_II20: + addi.d I, I, -1 + blt ZERO, I, .L_J1 + +.L_N0: /* if(n&1)*/ + andi I, N, 0x01 + beq ZERO, I, .L_N00 + +.L_N1: + srai.d J, M, 0x02 + beq ZERO, J, .L_N10 + +.L_N11: /* j = (m >> 2) if (j > 0) */ + vld U0, TS, 0x00 + vld U1, TS, 0x10 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + + addi.d TS, TS, 0x20 // a_offset + addi.d TD, TD, 0x20 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N11 + +.L_N10: + andi J, M, 0x03 + beq J, ZERO, .L_N00 + +.L_N12: /* j = (m & 3) if (j > 0) */ + fld.s F0, TS, 0x00 + fld.s F1, TS, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + + addi.d TS, TS, 0x08 // a_offset + addi.d TD, TD, 0x08 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N12 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE 
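cgemm_ncopy_2 packs two columns of A at a time so that, for every row, the packed buffer holds the complex element from column j immediately followed by the one from column j+1; a leftover odd column is copied straight through. A plain C sketch of the same packing order, assuming single-precision interleaved (re, im) storage and column-major A with leading dimension lda counted in complex elements (the vectorized loop above moves four rows per pass, but writes the same layout):

    static void cgemm_ncopy_2_ref(long m, long n, const float *a,
                                  long lda, float *b) {
        long j;
        for (j = 0; j + 1 < n; j += 2) {       /* two columns per pass */
            const float *a1 = a + 2 * j * lda;
            const float *a2 = a + 2 * (j + 1) * lda;
            for (long i = 0; i < m; i++) {     /* row i: column j, then j+1 */
                *b++ = a1[2 * i];  *b++ = a1[2 * i + 1];
                *b++ = a2[2 * i];  *b++ = a2[2 * i + 1];
            }
        }
        if (j < n) {                           /* odd trailing column */
            const float *a1 = a + 2 * j * lda;
            for (long i = 0; i < m; i++) {
                *b++ = a1[2 * i];  *b++ = a1[2 * i + 1];
            }
        }
    }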
\ No newline at end of file diff --git a/kernel/loongarch64/cgemm_tcopy_2_lasx.S b/kernel/loongarch64/cgemm_tcopy_2_lasx.S new file mode 100644 index 000000000..e2245e412 --- /dev/null +++ b/kernel/loongarch64/cgemm_tcopy_2_lasx.S @@ -0,0 +1,218 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r7 +#define T0 $r18 +#define S8 $r19 +#define S9 $r20 +#define S10 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr8 +#define D1 $xr9 +#define D2 $xr10 +#define D3 $xr11 +#define D4 $xr12 +#define D5 $xr13 +#define D6 $xr14 +#define D7 $xr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x02 //lda + slli.d TL, TL, 0x01 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x02 + add.d S9, DST, T0 //boffset2 + + srai.d J, M, 0x01 //j + + beq J, ZERO, .L_M1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 + add.d S2, S1, TL //aoffset2 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x20 + + srai.d I, N, 0x02 + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + xvld U0, S1, 0x00 + xvld U1, S1, 0x00 + xvld U2, S2, 0x00 + + xvpermi.q U0, U2, 0x02 + xvpermi.q U2, U1, 0x31 + + xvst U0, S8, 0x00 + + slli.d T0, M, 0x04 + add.d S8, S8, T0 + + xvst U2, S8, 0x00 + + add.d S8, S8, T0 + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + + vst $vr0, S8, 0x00 + vst $vr1, S8, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_J0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, S9, 0x00 + fst.s F1, S9, 0x04 + fst.s F2, S9, 0x08 + fst.s F3, S9, 0x0c + + addi.d S9, S9, 0x10 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_M0 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M1N1 + +.L_M1I1: /* if(i>0) */ + vld $vr0, TS, 0x00 + vld $vr1, TS, 0x10 + + vst $vr0, TD, 0x00 + + slli.d T0, M, 0x04 + add.d TD, TD, T0 + + vst $vr1, TD, 0x00 + + add.d TD, TD, T0 + addi.d TS, TS, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_M1I1 + +.L_M1N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M1N2 + + vld $vr0, TS, 0x00 + + vst $vr0, TD, 0x00 + + addi.d TS, TS, 0x10 + +.L_M1N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + fld.s F0, TS, 0x00 + fld.s F1, TS, 0x04 + + fst.s F0, S9, 0x00 + fst.s F1, S9, 0x04 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_tcopy_2_lsx.S b/kernel/loongarch64/cgemm_tcopy_2_lsx.S new file mode 100644 index 000000000..15c0fde8f --- /dev/null +++ b/kernel/loongarch64/cgemm_tcopy_2_lsx.S @@ -0,0 +1,218 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r7 +#define T0 $r18 +#define S8 $r19 +#define S9 $r20 +#define S10 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define D0 $vr8 +#define D1 $vr9 +#define D2 $vr10 +#define D3 $vr11 +#define D4 $vr12 +#define D5 $vr13 +#define D6 $vr14 +#define D7 $vr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x02 //lda + slli.d TL, TL, 0x01 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x02 + add.d S9, DST, T0 //boffset2 + + srai.d J, M, 0x01 //j + + beq J, ZERO, .L_M1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 + add.d S2, S1, TL //aoffset2 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x20 + + srai.d I, N, 0x02 + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, S8, 0x00 + vst U2, S8, 0x10 + + slli.d T0, M, 0x04 + add.d S8, S8, T0 + + vst U1, S8, 0x00 + vst U3, S8, 0x10 + + add.d S8, S8, T0 + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + 
beq ZERO, I, .L_J0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, S9, 0x00 + fst.s F1, S9, 0x04 + fst.s F2, S9, 0x08 + fst.s F3, S9, 0x0c + + addi.d S9, S9, 0x10 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_M0 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M1N1 + +.L_M1I1: /* if(i>0) */ + vld U0, TS, 0x00 + vld U1, TS, 0x10 + + vst U0, TD, 0x00 + + slli.d T0, M, 0x04 + add.d TD, TD, T0 + + vst U1, TD, 0x00 + + add.d TD, TD, T0 + addi.d TS, TS, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_M1I1 + +.L_M1N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M1N2 + + vld U0, TS, 0x00 + + vst U0, TD, 0x00 + + addi.d TS, TS, 0x10 + +.L_M1N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + fld.s F0, TS, 0x00 + fld.s F1, TS, 0x04 + + fst.s F0, S9, 0x00 + fst.s F1, S9, 0x04 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cnrm2_lasx.S b/kernel/loongarch64/cnrm2_lasx.S new file mode 100644 index 000000000..3a60069ac --- /dev/null +++ b/kernel/loongarch64/cnrm2_lasx.S @@ -0,0 +1,147 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r12 +#define t2 $r13 +#define t3 $r14 +#define t4 $r15 +#define a1 $f15 +#define a2 $f16 +#define res $f19 +#define VX0 $xr15 +#define VX1 $xr16 +#define VX2 $xr17 +#define VX3 $xr18 +#define VX4 $xr21 +#define res1 $xr19 +#define res2 $xr20 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + bge $r0, N, .L999 + beq $r0, INCX, .L999 + li.d TEMP, SIZE + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bge $r0, I, .L997 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvfcvtl.d.s VX1, VX0 + xvfcvth.d.s VX2, VX0 + xvfmadd.d res1, VX1, VX1, res1 + xvfmadd.d res2, VX2, VX2, res2 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + .align 3 + b .L996 + +.L20: + bge $r0, I, .L997 + .align 3 + +.L21: + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfcvtl.d.s VX1, VX0 + xvfcvth.d.s VX2, VX0 + xvfmadd.d res1, VX1, VX1, res1 + xvfmadd.d res2, VX2, VX2, res2 + addi.d I, I, -1 + blt $r0, I, .L21 + b .L996 + +.L996: + xvfadd.d res1, res1, res2 + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 + .align 3 + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.s a1, X, 0 * SIZE + fld.s a2, X, 1 * SIZE + addi.d I, I, -1 + fcvt.d.s a1, a1 + fcvt.d.s a2, a2 + fmadd.d res, a1, a1, res + fmadd.d res, a2, a2, res + add.d X, X, INCX + blt $r0, I, .L998 + .align 3 + +.L999: + fsqrt.d res, res + move $r4, $r17 + fcvt.s.d $f0, res + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/cnrm2_lsx.S b/kernel/loongarch64/cnrm2_lsx.S new file mode 100644 index 000000000..20950ba17 --- /dev/null +++ b/kernel/loongarch64/cnrm2_lsx.S @@ -0,0 +1,155 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r12 +#define t2 $r13 +#define t3 $r14 +#define t4 $r15 +#define a1 $f15 +#define a2 $f16 +#define res $f19 +#define VX0 $vr15 +#define VX1 $vr16 +#define VX2 $vr17 +#define VX3 $vr18 +#define VX4 $vr21 +#define res1 $vr19 +#define res2 $vr20 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + bge $r0, N, .L999 + beq $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bge $r0, I, .L997 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vfcvtl.d.s VX1, VX0 + vfcvth.d.s VX2, VX0 + vfmadd.d res1, VX1, VX1, res1 + vfmadd.d res2, VX2, VX2, res2 + vld VX0, X, 4 * SIZE + vfcvtl.d.s VX3, VX0 + vfcvth.d.s VX4, VX0 + vfmadd.d res1, VX3, VX3, res1 + vfmadd.d res2, VX4, VX4, res2 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + b .L996 + .align 3 + +.L20: + bge $r0, I, .L997 + .align 3 + +.L21: + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfcvtl.d.s VX1, VX0 + vfcvth.d.s VX2, VX0 + vfmadd.d res1, VX1, VX1, res1 + vfmadd.d res2, VX2, VX2, res2 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfcvtl.d.s VX3, VX0 + vfcvth.d.s VX4, VX0 + vfmadd.d res1, VX3, VX3, res1 + vfmadd.d res2, VX4, VX4, res2 + addi.d I, I, -1 + blt $r0, I, .L21 + b .L996 + .align 3 + +.L996: + vfadd.d res1, res1, res2 + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 + .align 3 + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.s a1, X, 0 * SIZE + fld.s a2, X, 1 * SIZE + addi.d I, I, -1 + fcvt.d.s a1, a1 + fcvt.d.s a2, a2 + fmadd.d res, a1, a1, res + fmadd.d res, a2, a2, res + add.d X, X, INCX + blt $r0, I, .L998 + .align 3 + +.L999: + fsqrt.d res, res + move $r4, $r17 + fcvt.s.d $f0, $f19 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/copy_lasx.S b/kernel/loongarch64/copy_lasx.S new file mode 100644 index 000000000..31f91cec1 --- /dev/null +++ b/kernel/loongarch64/copy_lasx.S @@ -0,0 +1,306 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define VX0 $xr12 +#define VX1 $xr13 + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +/* INCX==1 and INCY==1 */ +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + xvld VX0, X, 0 + addi.d I, I, -1 + xvst VX0, Y, 0 +#ifdef DOUBLE + xvld VX0, X, 32 + xvst VX0, Y, 32 +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + LD $f12, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + ST $f12, Y, 0 + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +/* INCX==1 and INCY!=1 */ +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + xvld VX0, X, 0 + xvld VX1, X, 32 + xvstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.d VX0, Y, 0, 1 + add.d Y, Y, INCY + xvstelm.d VX0, Y, 0, 2 + add.d Y, Y, INCY + xvstelm.d VX0, Y, 0, 3 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 1 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 2 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 3 + add.d Y, Y, INCY +#else + xvld VX0, X, 0 + xvstelm.w VX0, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 1 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 2 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 3 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 4 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 5 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 6 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 7 + add.d Y, Y, INCY +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD $f12, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + ST $f12, Y, 0 + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +/* 
INCX!=1 and INCY==1 */ +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + xvst VX0, Y, 0 + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvst VX1, Y, 32 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvst VX0, Y, 0 +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD $f12, X, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +/* INCX!=1 and INCY!=1 */ +.L22: + bge $r0, I, .L223 + .align 3 + +.L222: + LD a1, X, 0 + add.d X, X, INCX + LD a2, X, 0 + add.d X, X, INCX + LD a3, X, 0 + add.d X, X, INCX + LD a4, X, 0 + add.d X, X, INCX + ST a1, Y, 0 + add.d Y, Y, INCY + ST a2, Y, 0 + add.d Y, Y, INCY + ST a3, X, 0 + add.d Y, Y, INCY + ST a4, X, 0 + add.d Y, Y, INCY + LD a1, X, 0 + add.d X, X, INCX + LD a2, X, 0 + add.d X, X, INCX + LD a3, X, 0 + add.d X, X, INCX + LD a4, X, 0 + add.d X, X, INCX + ST a1, Y, 0 + add.d Y, Y, INCY + ST a2, Y, 0 + add.d Y, Y, INCY + ST a3, X, 0 + add.d Y, Y, INCY + ST a4, X, 0 + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + LD $f12, X, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/copy_lsx.S b/kernel/loongarch64/copy_lsx.S new file mode 100644 index 000000000..bb10f3565 --- /dev/null +++ b/kernel/loongarch64/copy_lsx.S @@ -0,0 +1,316 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. 
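The copy_lasx.S kernel above and the copy_lsx.S kernel that follows implement the plain BLAS copy, y[i*incy] = x[i*incx], with four code paths chosen by whether INCX and INCY are unit strides: non-unit source strides are gathered with ld.w/ld.d plus vinsgr2vr, non-unit destination strides are scattered with vstelm, and the fully strided path falls back to scalar LD/ST. One detail worth double-checking in that fully strided path (.L222 in both files): the third and fourth stores are written as "ST a3, X, 0" / "ST a4, X, 0", which looks like it was meant to target Y. A minimal scalar sketch of the intended semantics (illustrative single-precision signature; the real kernels serve both S and D through the FLOAT/SIZE macros):

    /* Scalar sketch of ?COPY: copy n elements from x (stride incx) to
     * y (stride incy). The vector kernels unroll this by 8 elements. */
    static void copy_sketch(long n, const float *x, long incx,
                            float *y, long incy)
    {
        for (long i = 0; i < n; i++)
            y[i * incy] = x[i * incx];
    }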
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define VX0 $vr12 +#define VX1 $vr13 + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +/* INCX==1 and INCY==1 */ +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + vld VX0, X, 0 + vld VX1, X, 16 + addi.d I, I, -1 + vst VX0, Y, 0 + vst VX1, Y, 16 +#ifdef DOUBLE + vld VX0, X, 32 + vld VX1, X, 48 + vst VX0, Y, 32 + vst VX1, Y, 48 +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + LD $f12, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + ST $f12, Y, 0 + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +/* INCX==1 and INCY!=1 */ +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + vld VX0, X, 0 + vld VX1, X, 16 + vstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VX0, Y, 0, 1 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 1 + add.d Y, Y, INCY + vld VX0, X, 32 + vld VX1, X, 48 + vstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VX0, Y, 0, 1 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 1 + add.d Y, Y, INCY +#else + vld VX0, X, 0 + vld VX1, X, 16 + vstelm.w VX0, Y, 0, 0 + add.d Y, Y, INCY + vstelm.w VX0, Y, 0, 1 + add.d Y, Y, INCY + vstelm.w VX0, Y, 0, 2 + add.d Y, Y, INCY + vstelm.w VX0, Y, 0, 3 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 0 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 1 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 2 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 3 + add.d Y, Y, INCY +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD $f12, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + ST $f12, Y, 0 + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +/* INCX!=1 and INCY==1 */ +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 
1 + vst VX0, Y, 0 + vst VX1, Y, 16 + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 32 + vst VX1, Y, 48 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vst VX0, Y, 0 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vst VX1, Y, 16 +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD $f12, X, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +/* INCX!=1 and INCY!=1 */ +.L22: + bge $r0, I, .L223 + .align 3 + +.L222: + LD a1, X, 0 + add.d X, X, INCX + LD a2, X, 0 + add.d X, X, INCX + LD a3, X, 0 + add.d X, X, INCX + LD a4, X, 0 + add.d X, X, INCX + ST a1, Y, 0 + add.d Y, Y, INCY + ST a2, Y, 0 + add.d Y, Y, INCY + ST a3, X, 0 + add.d Y, Y, INCY + ST a4, X, 0 + add.d Y, Y, INCY + LD a1, X, 0 + add.d X, X, INCX + LD a2, X, 0 + add.d X, X, INCX + LD a3, X, 0 + add.d X, X, INCX + LD a4, X, 0 + add.d X, X, INCX + ST a1, Y, 0 + add.d Y, Y, INCY + ST a2, Y, 0 + add.d Y, Y, INCY + ST a3, X, 0 + add.d Y, Y, INCY + ST a4, X, 0 + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + LD $f12, X, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/crot_lasx.S b/kernel/loongarch64/crot_lasx.S new file mode 100644 index 000000000..d4ec1e22c --- /dev/null +++ b/kernel/loongarch64/crot_lasx.S @@ -0,0 +1,1079 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
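The crot_lasx.S kernel that follows (and its LSX twin later in the patch) applies a plane rotation with real c and s to a pair of complex vectors, i.e. the csrot/zdrot-style update, treating real and imaginary parts independently; the vector paths de-interleave with vpickev/vpickod, compute, re-interleave with vilvl/vilvh, and special-case c==0 and/or s==0. A minimal scalar sketch of the update, with illustrative names:

    /* Scalar sketch of the rotation implemented by the crot kernels:
     *   x'[i] = c*x[i] + s*y[i]
     *   y'[i] = c*y[i] - s*x[i]
     * applied separately to the real and imaginary components. */
    static void csrot_sketch(long n, float *x, long incx,
                             float *y, long incy, float c, float s)
    {
        for (long i = 0; i < n; i++) {
            float *px = x + 2 * i * incx;   /* complex stride */
            float *py = y + 2 * i * incy;
            for (int k = 0; k < 2; k++) {   /* k = 0: real, k = 1: imag */
                float xk = px[k], yk = py[k];
                px[k] = c * xk + s * yk;
                py[k] = c * yk - s * xk;
            }
        }
    }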
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define C $f0 +#define S $f1 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VT0 $xr10 +#define VT1 $xr18 +#define VXC $xr23 +#define VXS $xr9 +#define VXZ $xr11 +#define x1 $xr12 +#define x2 $xr13 +#define x3 $xr14 +#define x4 $xr15 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + MTG t1, C + MTG t2, S + MTG t3, a1 +#ifdef DOUBLE + xvreplgr2vr.d VXC, t1 + xvreplgr2vr.d VXS, t2 + xvreplgr2vr.d VXZ, t3 + srai.d I, N, 2 +#else + xvreplgr2vr.w VXC, t1 + xvreplgr2vr.w VXS, t2 + xvreplgr2vr.w VXZ, t3 + srai.d I, N, 3 +#endif + beq INCX, $r0, .L996 + beq INCY, $r0, .L996 + bne INCX, TEMP, .L22 // INCX!=1 or INCY!=1 + bne INCY, TEMP, .L22 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L110 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L112 // C!=0 S==0 + b .L111 // C!=0 S!=0 + .align 3 + +.L110: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L114 // C==0 S==0 + b .L113 // C==0 S!=0 + .align 3 + +.L111: // C!=0 S!=0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 + xvfmul.d VX0, x1, VXC + xvfmadd.d VX0, x3, VXS, VX0 + xvfmul.d VX1, x1, VXS + xvfmsub.d VX1, x3, VXC, VX1 + xvfmul.d VX2, x2, VXC + xvfmadd.d VX2, x4, VXS, VX2 + xvfmul.d VX3, x2, VXS + xvfmsub.d VX3, x4, VXC, VX3 + xvilvl.d x1, VX2 ,VX0 + xvilvh.d x2, VX2, VX0 + xvilvl.d x3, VX3 ,VX1 + xvilvh.d x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 4 * SIZE + xvst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvld VX3, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 + xvfmul.s VX0, x1, VXC + xvfmadd.s VX0, x3, VXS, VX0 + xvfmul.s VX1, x1, VXS + xvfmsub.s VX1, x3, VXC, VX1 + xvfmul.s VX2, x2, VXC + xvfmadd.s VX2, x4, VXS, VX2 + xvfmul.s VX3, x2, VXS + xvfmsub.s VX3, x4, VXC, VX3 + xvilvl.w x1, VX2 ,VX0 + xvilvh.w x2, VX2, VX0 + xvilvl.w x3, VX3 ,VX1 + xvilvh.w x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 8 * SIZE + xvst x4, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // C!=0 S==0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE 
+ xvld VX3, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 + xvfmul.d VX0, x1, VXC + xvfmul.d VX1, x3, VXC + xvfmul.d VX2, x2, VXC + xvfmul.d VX3, x4, VXC + xvilvl.d x1, VX2 ,VX0 + xvilvh.d x2, VX2, VX0 + xvilvl.d x3, VX3 ,VX1 + xvilvh.d x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 4 * SIZE + xvst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvld VX3, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 + xvfmul.s VX0, x1, VXC + xvfmul.s VX1, x3, VXC + xvfmul.s VX2, x2, VXC + xvfmul.s VX3, x4, VXC + xvilvl.w x1, VX2 ,VX0 + xvilvh.w x2, VX2, VX0 + xvilvl.w x3, VX3 ,VX1 + xvilvh.w x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 8 * SIZE + xvst x4, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // C==0 S!=0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 + xvfmul.d VX0, x3, VXS + xvfmul.d VX1, x1, VXS + xvfsub.d VX1, VXZ, VX1 + xvfmul.d VX2, x4, VXS + xvfmul.d VX3, x2, VXS + xvfsub.d VX3, VXZ, VX3 + xvilvl.d x1, VX2 ,VX0 + xvilvh.d x2, VX2, VX0 + xvilvl.d x3, VX3 ,VX1 + xvilvh.d x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 4 * SIZE + xvst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvld VX3, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 + xvfmul.s VX0, x3, VXS + xvfmul.s VX1, x1, VXS + xvfsub.s VX1, VXZ, VX1 + xvfmul.s VX2, x4, VXS + xvfmul.s VX3, x2, VXS + xvfsub.s VX3, VXZ, VX3 + xvilvl.w x1, VX2 ,VX0 + xvilvh.w x2, VX2, VX0 + xvilvl.w x3, VX3 ,VX1 + xvilvh.w x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 8 * SIZE + xvst x4, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // C==0 S==0 + xvst VXZ, X, 0 * SIZE + xvst VXZ, Y, 0 * SIZE +#ifdef DOUBLE + xvst VXZ, X, 4 * SIZE + xvst VXZ, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvst VXZ, X, 8 * SIZE + xvst VXZ, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + move XX, X + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L220 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L222 // C!=0 S==0 + b .L221 // C!=0 S!=0 + .align 3 + +.L220: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L224 // C==0 S==0 + b .L223 // C==0 S!=0 + .align 3 + +.L221: // C!=0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 1 + xvinsgr2vr.d x4, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + 
xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.d x3, t1, 2 + xvinsgr2vr.d x4, t2, 2 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + add.d Y, Y, INCY + + xvfmul.d VX0, x1, VXC + xvfmadd.d VX0, x3, VXS, VX0 + xvfmul.d VX1, x1, VXS + xvfmsub.d VX1, x3, VXC, VX1 + xvfmul.d VX2, x2, VXC + xvfmadd.d VX2, x4, VXS, VX2 + xvfmul.d VX3, x2, VXS + xvfmsub.d VX3, x4, VXC, VX3 + xvstelm.d VX0, XX, 0, 0 + xvstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 1 + xvstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 2 + xvstelm.d VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 3 + xvstelm.d VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.d VX1, YY, 0, 0 + xvstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 1 + xvstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 2 + xvstelm.d VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 3 + xvstelm.d VX3, YY, 1 * SIZE, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + add.d Y, Y, INCY + + xvfmul.s VX0, x1, VXC + xvfmadd.s VX0, x3, VXS, VX0 + xvfmul.s VX1, x1, VXS + xvfmsub.s VX1, x3, VXC, VX1 + xvfmul.s VX2, x2, VXC + xvfmadd.s VX2, x4, VXS, VX2 + xvfmul.s VX3, x2, VXS + xvfmsub.s VX3, x4, VXC, VX3 + xvstelm.w VX0, XX, 0, 0 + xvstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 1 + xvstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 2 + xvstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 3 + xvstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.w VX1, YY, 0, 0 + 
xvstelm.w VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 1 + xvstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 2 + xvstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 3 + xvstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w VX0, XX, 0, 4 + xvstelm.w VX2, XX, 1 * SIZE, 4 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 5 + xvstelm.w VX2, XX, 1 * SIZE, 5 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 6 + xvstelm.w VX2, XX, 1 * SIZE, 6 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 7 + xvstelm.w VX2, XX, 1 * SIZE, 7 + add.d XX, XX, INCX + xvstelm.w VX1, YY, 0, 4 + xvstelm.w VX3, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 5 + xvstelm.w VX3, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 6 + xvstelm.w VX3, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 7 + xvstelm.w VX3, YY, 1 * SIZE, 7 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: // C!=0 S==0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 1 + xvinsgr2vr.d x4, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.d x3, t1, 2 + xvinsgr2vr.d x4, t2, 2 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX0, x1, VXC + xvfmul.d VX1, x3, VXC + xvfmul.d VX2, x2, VXC + xvfmul.d VX3, x4, VXC + xvstelm.d VX0, XX, 0, 0 + xvstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 1 + xvstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 2 + xvstelm.d VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 3 + xvstelm.d VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.d VX1, YY, 0, 0 + xvstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 1 + xvstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 2 + xvstelm.d VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 3 + xvstelm.d VX3, YY, 1 * SIZE, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 
+ xvinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX0, x1, VXC + xvfmul.s VX1, x3, VXC + xvfmul.s VX2, x2, VXC + xvfmul.s VX3, x4, VXC + xvstelm.w VX0, XX, 0, 0 + xvstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 1 + xvstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 2 + xvstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 3 + xvstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.w VX1, YY, 0, 0 + xvstelm.w VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 1 + xvstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 2 + xvstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 3 + xvstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w VX0, XX, 0, 4 + xvstelm.w VX2, XX, 1 * SIZE, 4 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 5 + xvstelm.w VX2, XX, 1 * SIZE, 5 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 6 + xvstelm.w VX2, XX, 1 * SIZE, 6 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 7 + xvstelm.w VX2, XX, 1 * SIZE, 7 + add.d XX, XX, INCX + xvstelm.w VX1, YY, 0, 4 + xvstelm.w VX3, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 5 + xvstelm.w VX3, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 6 + xvstelm.w VX3, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 7 + xvstelm.w VX3, YY, 1 * SIZE, 7 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: // C==0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 1 + xvinsgr2vr.d x4, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.d x3, t1, 2 + xvinsgr2vr.d x4, t2, 2 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX0, x3, VXS + xvfmul.d VX1, x1, VXS + xvfsub.d VX1, VXZ, VX1 + xvfmul.d VX2, x4, VXS + xvfmul.d VX3, x2, VXS + xvfsub.d VX3, VXZ, VX3 + xvstelm.d VX0, XX, 0, 0 + xvstelm.d 
VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 1 + xvstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 2 + xvstelm.d VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 3 + xvstelm.d VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.d VX1, YY, 0, 0 + xvstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 1 + xvstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 2 + xvstelm.d VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 3 + xvstelm.d VX3, YY, 1 * SIZE, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX0, x3, VXS + xvfmul.s VX1, x1, VXS + xvfsub.s VX1, VXZ, VX1 + xvfmul.s VX2, x4, VXS + xvfmul.s VX3, x2, VXS + xvfsub.s VX3, VXZ, VX3 + xvstelm.w VX0, XX, 0, 0 + xvstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 1 + xvstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 2 + xvstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 3 + xvstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.w VX1, YY, 0, 0 + xvstelm.w VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 1 + xvstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 2 + xvstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 3 + xvstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w VX0, XX, 0, 4 + xvstelm.w VX2, XX, 1 * SIZE, 4 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 5 + xvstelm.w VX2, XX, 1 * SIZE, 5 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 6 + xvstelm.w VX2, XX, 1 * SIZE, 6 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 7 + xvstelm.w VX2, XX, 1 * SIZE, 7 + add.d XX, XX, INCX + xvstelm.w VX1, 
YY, 0, 4 + xvstelm.w VX3, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 5 + xvstelm.w VX3, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 6 + xvstelm.w VX3, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 7 + xvstelm.w VX3, YY, 1 * SIZE, 7 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: // C==0 S==0 +#ifdef DOUBLE + xvstelm.d VXZ, XX, 0, 0 + xvstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 0 + xvstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 0 + xvstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 0 + xvstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, YY, 0, 0 + xvstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 0 + xvstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 0 + xvstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 0 + xvstelm.d VXZ, YY, 1 * SIZE, 0 +#else + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + move X, XX + move Y, YY + b .L997 + .align 3 + +.L996: + move I, N + b .L998 + .align 3 + +.L997: +#ifdef DOUBLE + andi I, N, 3 +#else + andi I, N, 7 +#endif + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + MUL s1, a1, C + MADD s1, a3, S, s1 + MUL s2, a1, S + MSUB s2, a3, C, s2 + MUL s3, a2, C + MADD s3, a4, S, s3 + MUL s4, a2, S + MSUB s4, a4, C, s4 + addi.d I, I, -1 + ST s1, X, 0 * SIZE + ST s2, Y, 0 * SIZE + ST s3, X, 1 * SIZE + ST s4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/crot_lsx.S b/kernel/loongarch64/crot_lsx.S new file mode 100644 index 000000000..126257edc --- /dev/null +++ b/kernel/loongarch64/crot_lsx.S @@ -0,0 +1,907 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define C $f0 +#define S $f1 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VT0 $vr10 +#define VT1 $vr18 +#define VXC $vr23 +#define VXS $vr9 +#define VXZ $vr11 +#define x1 $vr12 +#define x2 $vr13 +#define x3 $vr14 +#define x4 $vr15 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + MTG t1, C + MTG t2, S + MTG t3, a1 +#ifdef DOUBLE + vreplgr2vr.d VXC, t1 + vreplgr2vr.d VXS, t2 + vreplgr2vr.d VXZ, t3 +#else + vreplgr2vr.w VXC, t1 + vreplgr2vr.w VXS, t2 + vreplgr2vr.w VXZ, t3 + srai.d I, N, 2 +#endif + beq INCX, $r0, .L996 + beq INCY, $r0, .L996 + bne INCX, TEMP, .L22 // INCX!=1 or INCY!=1 + bne INCY, TEMP, .L22 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L110 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L112 // C!=0 S==0 + b .L111 // C!=0 S!=0 + .align 3 + +.L110: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L114 // C==0 S==0 + b .L113 // C==0 S!=0 + .align 3 + +.L111: // C!=0 S!=0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmul.d VX0, x1, VXC + vfmadd.d VX0, x3, VXS, VX0 + vfmul.d VX1, x1, VXS + vfmsub.d VX1, x3, VXC, VX1 + vfmul.d VX2, x2, VXC + vfmadd.d VX2, x4, VXS, VX2 + vfmul.d VX3, x2, VXS + vfmsub.d VX3, x4, VXC, VX3 + vilvl.d x1, VX2 ,VX0 + vilvh.d x2, VX2, VX0 + vilvl.d x3, VX3 ,VX1 + vilvh.d x4, VX3, VX1 + vst x1, X, 0 * SIZE + vst x3, Y, 0 * SIZE + vst x2, X, 2 * SIZE + vst x4, Y, 2 * SIZE + addi.d X, X, 4 * SIZE + addi.d Y, Y, 4 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + 
vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmul.s VX0, x1, VXC + vfmadd.s VX0, x3, VXS, VX0 + vfmul.s VX1, x1, VXS + vfmsub.s VX1, x3, VXC, VX1 + vfmul.s VX2, x2, VXC + vfmadd.s VX2, x4, VXS, VX2 + vfmul.s VX3, x2, VXS + vfmsub.s VX3, x4, VXC, VX3 + vilvl.w x1, VX2 ,VX0 + vilvh.w x2, VX2, VX0 + vilvl.w x3, VX3 ,VX1 + vilvh.w x4, VX3, VX1 + vst x1, X, 0 * SIZE + vst x3, Y, 0 * SIZE + vst x2, X, 4 * SIZE + vst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // C!=0 S==0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmul.d VX0, x1, VXC + vfmul.d VX1, x3, VXC + vfmul.d VX2, x2, VXC + vfmul.d VX3, x4, VXC + vilvl.d x1, VX2 ,VX0 + vilvh.d x2, VX2, VX0 + vilvl.d x3, VX3 ,VX1 + vilvh.d x4, VX3, VX1 + vst x1, X, 0 * SIZE + vst x3, Y, 0 * SIZE + vst x2, X, 2 * SIZE + vst x4, Y, 2 * SIZE + addi.d X, X, 4 * SIZE + addi.d Y, Y, 4 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmul.s VX0, x1, VXC + vfmul.s VX1, x3, VXC + vfmul.s VX2, x2, VXC + vfmul.s VX3, x4, VXC + vilvl.w x1, VX2 ,VX0 + vilvh.w x2, VX2, VX0 + vilvl.w x3, VX3 ,VX1 + vilvh.w x4, VX3, VX1 + vst x1, X, 0 * SIZE + vst x3, Y, 0 * SIZE + vst x2, X, 4 * SIZE + vst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // C==0 S!=0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmul.d VX0, x3, VXS + vfmul.d VX1, x1, VXS + vfsub.d VX1, VXZ, VX1 + vfmul.d VX2, x4, VXS + vfmul.d VX3, x2, VXS + vfsub.d VX3, VXZ, VX3 + vilvl.d x1, VX2 ,VX0 + vilvh.d x2, VX2, VX0 + vilvl.d x3, VX3 ,VX1 + vilvh.d x4, VX3, VX1 + vst x1, X, 0 * SIZE + vst x3, Y, 0 * SIZE + vst x2, X, 2 * SIZE + vst x4, Y, 2 * SIZE + addi.d X, X, 4 * SIZE + addi.d Y, Y, 4 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmul.s VX0, x3, VXS + vfmul.s VX1, x1, VXS + vfsub.s VX1, VXZ, VX1 + vfmul.s VX2, x4, VXS + vfmul.s VX3, x2, VXS + vfsub.s VX3, VXZ, VX3 + vilvl.w x1, VX2 ,VX0 + vilvh.w x2, VX2, VX0 + vilvl.w x3, VX3 ,VX1 + vilvh.w x4, VX3, VX1 + vst x1, X, 0 * SIZE + vst x3, Y, 0 * SIZE + vst x2, X, 4 * SIZE + vst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // C==0 S==0 + vst VXZ, X, 0 * SIZE + vst VXZ, Y, 0 * SIZE +#ifdef DOUBLE + vst VXZ, X, 2 * SIZE + vst VXZ, Y, 2 * SIZE + addi.d X, X, 4 * SIZE + addi.d Y, Y, 4 * SIZE +#else + vst VXZ, X, 4 * SIZE + vst VXZ, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L22: +#ifdef DOUBLE + srai.d I, N, 2 +#endif + bge $r0, I, .L997 + move YY, Y + move XX, X + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L220 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L222 // C!=0 S==0 + b .L221 // C!=0 S!=0 + .align 3 + +.L220: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L224 // C==0 S==0 + b .L223 // C==0 S!=0 + .align 3 + 
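The strided blocks from .L221 onward reuse the same four-way special-casing on c and s as the unit-stride path earlier in this file. A compact sketch of that dispatch (labels quoted for orientation only; the enum names are illustrative, not from the patch):

    /* Sketch of the (c, s) dispatch shared by the unit-stride and
     * strided paths of the crot kernels. */
    typedef enum {
        FULL_ROTATION, SCALE_BY_C, SWAP_SCALE_BY_S, ZERO_BOTH
    } rot_case;

    static rot_case crot_dispatch(float c, float s)
    {
        if (c != 0.0f)
            return (s != 0.0f) ? FULL_ROTATION    /* .L111 / .L221 */
                               : SCALE_BY_C;      /* .L112 / .L222 */
        return (s != 0.0f) ? SWAP_SCALE_BY_S      /* .L113 / .L223 */
                           : ZERO_BOTH;           /* .L114 / .L224 */
    }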
+.L221: // C!=0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x1, VXC + vfmadd.d VX0, x3, VXS, VX0 + vfmul.d VX1, x1, VXS + vfmsub.d VX1, x3, VXC, VX1 + vfmul.d VX2, x2, VXC + vfmadd.d VX2, x4, VXS, VX2 + vfmul.d VX3, x2, VXS + vfmsub.d VX3, x4, VXC, VX3 + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x1, VXC + vfmadd.d VX0, x3, VXS, VX0 + vfmul.d VX1, x1, VXS + vfmsub.d VX1, x3, VXC, VX1 + vfmul.d VX2, x2, VXC + vfmadd.d VX2, x4, VXS, VX2 + vfmul.d VX3, x2, VXS + vfmsub.d VX3, x4, VXC, VX3 + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L995 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + + vfmul.s VX0, x1, VXC + vfmadd.s VX0, x3, VXS, VX0 + vfmul.s VX1, x1, VXS + vfmsub.s VX1, x3, VXC, VX1 + vfmul.s VX2, x2, VXC + vfmadd.s VX2, x4, VXS, VX2 + vfmul.s VX3, x2, VXS + vfmsub.s VX3, x4, VXC, VX3 + vstelm.w VX0, XX, 0, 0 + vstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 1 + vstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 2 + vstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 3 + vstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + vstelm.w VX1, YY, 0, 0 + vstelm.w VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w 
VX1, YY, 0, 1 + vstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 2 + vstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 3 + vstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 +#endif + .align 3 + +.L222: // C!=0 S==0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x1, VXC + vfmul.d VX1, x3, VXC + vfmul.d VX2, x2, VXC + vfmul.d VX3, x4, VXC + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x1, VXC + vfmul.d VX1, x3, VXC + vfmul.d VX2, x2, VXC + vfmul.d VX3, x4, VXC + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + b .L995 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + vfmul.s VX0, x1, VXC + vfmul.s VX1, x3, VXC + vfmul.s VX2, x2, VXC + vfmul.s VX3, x4, VXC + vstelm.w VX0, XX, 0, 0 + vstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 1 + vstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 2 + vstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 3 + vstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + vstelm.w VX1, YY, 0, 0 + vstelm.w VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 1 + vstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, 
YY, INCY + vstelm.w VX1, YY, 0, 2 + vstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 3 + vstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + b .L997 +#endif + .align 3 + +.L223: // C==0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x3, VXS + vfmul.d VX1, x1, VXS + vfsub.d VX1, VXZ, VX1 + vfmul.d VX2, x4, VXS + vfmul.d VX3, x2, VXS + vfsub.d VX3, VXZ, VX3 + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x3, VXS + vfmul.d VX1, x1, VXS + vfsub.d VX1, VXZ, VX1 + vfmul.d VX2, x4, VXS + vfmul.d VX3, x2, VXS + vfsub.d VX3, VXZ, VX3 + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L223 + b .L995 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + vfmul.s VX0, x3, VXS + vfmul.s VX1, x1, VXS + vfsub.s VX1, VXZ, VX1 + vfmul.s VX2, x4, VXS + vfmul.s VX3, x2, VXS + vfsub.s VX3, VXZ, VX3 + vstelm.w VX0, XX, 0, 0 + vstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 1 + vstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 2 + vstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 3 + vstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + vstelm.w VX1, YY, 0, 0 + vstelm.w VX3, YY, 1 * SIZE, 0 + 
add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 1 + vstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 2 + vstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 3 + vstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L223 + b .L997 +#endif + .align 3 + +.L224: // C==0 S==0 +#ifdef DOUBLE + vstelm.d VXZ, XX, 0, 0 + vstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + vstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + vstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + vstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VXZ, YY, 0, 0 + vstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + vstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + vstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + vstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + move X, XX + move Y, YY + b .L995 +#else + vstelm.w VXZ, XX, 0, 0 + vstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 0 + vstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 0 + vstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 0 + vstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VXZ, YY, 0, 0 + vstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + vstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + vstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + vstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + move X, XX + move Y, YY + b .L997 +#endif + .align 3 + +#ifdef DOUBLE + .L995: + andi I, N, 3 + bge $r0, I, .L999 + b .L998 + .align 3 + +#endif +.L996: + move I, N + b .L998 + .align 3 + +.L997: +#ifdef DOUBLE + andi I, N, 1 +#else + andi I, N, 3 +#endif + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + MUL s1, a1, C + MADD s1, a3, S, s1 + MUL s2, a1, S + MSUB s2, a3, C, s2 + MUL s3, a2, C + MADD s3, a4, S, s3 + MUL s4, a2, S + MSUB s4, a4, C, s4 + addi.d I, I, -1 + ST s1, X, 0 * SIZE + ST s2, Y, 0 * SIZE + ST s3, X, 1 * SIZE + ST s4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/cscal_lasx.S b/kernel/loongarch64/cscal_lasx.S new file mode 100644 index 000000000..3605a6c0e --- /dev/null +++ b/kernel/loongarch64/cscal_lasx.S @@ -0,0 +1,645 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
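Reviewer's annotation, not part of the patch: the rot kernel that ends just above applies the usual csrot/zdrot plane rotation, with real scalars c and s, to interleaved (re, im) data. A minimal C sketch of that semantics for cross-checking the strided branches (.L221-.L224); crot_ref and FLOAT are hypothetical names, strides are in complex elements and assumed positive.

#include <stddef.h>

typedef double FLOAT;                       /* assumption: DOUBLE build */

static void crot_ref(ptrdiff_t n, FLOAT *x, ptrdiff_t incx,
                     FLOAT *y, ptrdiff_t incy, FLOAT c, FLOAT s)
{
    for (ptrdiff_t i = 0; i < n; i++) {
        FLOAT *px = x + 2 * i * incx;       /* (re, im) pair of x[i] */
        FLOAT *py = y + 2 * i * incy;       /* (re, im) pair of y[i] */
        for (int k = 0; k < 2; k++) {       /* same rotation on re and im */
            FLOAT t = c * px[k] + s * py[k];
            py[k]   = c * py[k] - s * px[k];
            px[k]   = t;
        }
    }
}

The .L222 (S==0) and .L223 (C==0) branches are this update with the zero term folded away, and .L224 (both zero) degenerates to storing VXZ zeros.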
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define ALPHAR $f0 +#define ALPHAI $f1 +#define X $r7 +#define INCX $r8 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VXAR $xr23 +#define VXAI $xr19 +#define VXZ $xr12 +#define x1 $xr18 +#define x2 $xr17 +#define x3 $xr16 +#define x4 $xr15 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + MTG t1, ALPHAR +#ifdef DOUBLE + xvreplgr2vr.d VXAR, t1 + movfr2gr.d t2, ALPHAI + xvreplgr2vr.d VXAI, t2 + xvxor.v VXZ, VXZ, VXZ + srai.d I, N, 2 +#else + xvreplgr2vr.w VXAR, t1 + movfr2gr.s t2, ALPHAI + xvreplgr2vr.w VXAI, t2 + xvxor.v VXZ, VXZ, VXZ + srai.d I, N, 3 +#endif + bne INCX, TEMP, .L22 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, ALPHAR, a1 + CMPEQ $fcc1, ALPHAI, a1 + bceqz $fcc0, .L13 + b .L14 + .align 3 + +.L13: + bceqz $fcc1, .L114 //alpha_r != 0.0 && alpha_i != 0.0 + b .L113 //alpha_r != 0.0 && alpha_i == 0.0 + +.L14: + bceqz $fcc1, .L112 //alpha_r == 0.0 && alpha_i != 0.0 + b .L111 //alpha_r == 0.0 && alpha_i == 0.0 + .align 3 + +.L111: //alpha_r == 0.0 && alpha_i == 0.0 + xvst VXZ, X, 0 * SIZE +#ifdef DOUBLE + xvst VXZ, X, 4 * SIZE + addi.d X, X, 8 * SIZE +#else + xvst VXZ, X, 8 * SIZE + addi.d X, X, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: //alpha_r == 0.0 && alpha_i != 0.0 + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmul.d x3, VXAI, x2 + xvfsub.d x3, VXZ, x3 + xvfmul.d x4, VXAI, x1 + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + xvst VX2, X, 0 * SIZE + xvst VX3, X, 4 * SIZE + addi.d X, X, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s x3, VXAI, x2 + xvfsub.s x3, VXZ, x3 + xvfmul.s x4, VXAI, x1 + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + xvst VX2, X, 0 * SIZE + xvst VX3, X, 8 * SIZE + addi.d X, X, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: //alpha_r != 0.0 && alpha_i == 0.0 + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmul.d x3, VXAR, x1 + xvfmul.d x4, VXAR, x2 + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + xvst VX2, X, 0 * SIZE + xvst VX3, X, 4 * SIZE + addi.d X, X, 8 * SIZE +#else + xvld 
VX1, X, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s x3, VXAR, x1 + xvfmul.s x4, VXAR, x2 + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + xvst VX2, X, 0 * SIZE + xvst VX3, X, 8 * SIZE + addi.d X, X, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: //alpha_r != 0.0 && alpha_i != 0.0 + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmul.d VX0, VXAI, x2 + xvfmsub.d x3, VXAR, x1, VX0 + xvfmul.d VX1, VXAI, x1 + xvfmadd.d x4, VXAR, x2, VX1 + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + xvst VX2, X, 0 * SIZE + xvst VX3, X, 4 * SIZE + addi.d X, X, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s VX0, VXAI, x2 + xvfmsub.s x3, VXAR, x1, VX0 + xvfmul.s VX1, VXAI, x1 + xvfmadd.s x4, VXAR, x2, VX1 + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + xvst VX2, X, 0 * SIZE + xvst VX3, X, 8 * SIZE + addi.d X, X, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move XX, X + CMPEQ $fcc0, ALPHAR, a1 + CMPEQ $fcc1, ALPHAI, a1 + bceqz $fcc0, .L23 + b .L24 + .align 3 + +.L23: + bceqz $fcc1, .L224 //alpha_r != 0.0 && alpha_i != 0.0 + b .L223 //alpha_r != 0.0 && alpha_i == 0.0 + +.L24: + bceqz $fcc1, .L222 //alpha_r == 0.0 && alpha_i != 0.0 + b .L221 //alpha_r == 0.0 && alpha_i == 0.0 + .align 3 + +.L221: //alpha_r == 0.0 && alpha_i == 0.0 +#ifdef DOUBLE + xvstelm.d VXZ, X, 0, 0 + xvstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.d VXZ, X, 0, 0 + xvstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.d VXZ, X, 0, 0 + xvstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.d VXZ, X, 0, 0 + xvstelm.d VXZ, X, 1 * SIZE, 0 +#else + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 +#endif + add.d X, X, INCX + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: //alpha_r == 0.0 && alpha_i != 0.0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + + xvfmul.d x3, VXAI, x2 + xvfsub.d x3, VXZ, x3 + xvfmul.d x4, VXAI, x1 + addi.d I, I, -1 + xvstelm.d x3, XX, 0 * SIZE, 0 + xvstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 1 + xvstelm.d x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 2 + xvstelm.d x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 3 + xvstelm.d x4, XX, 1 * SIZE, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 
+ xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + + xvfmul.s x3, VXAI, x2 + xvfsub.s x3, VXZ, x3 + xvfmul.s x4, VXAI, x1 + addi.d I, I, -1 + xvstelm.w x3, XX, 0 * SIZE, 0 + xvstelm.w x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 1 + xvstelm.w x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 2 + xvstelm.w x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 3 + xvstelm.w x4, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 4 + xvstelm.w x4, XX, 1 * SIZE, 4 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 5 + xvstelm.w x4, XX, 1 * SIZE, 5 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 6 + xvstelm.w x4, XX, 1 * SIZE, 6 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 7 + xvstelm.w x4, XX, 1 * SIZE, 7 +#endif + add.d XX, XX, INCX + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: //alpha_r != 0.0 && alpha_i == 0.0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + + xvfmul.d x3, VXAR, x1 + xvfmul.d x4, VXAR, x2 + addi.d I, I, -1 + xvstelm.d x3, XX, 0 * SIZE, 0 + xvstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 1 + xvstelm.d x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 2 + xvstelm.d x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 3 + xvstelm.d x4, XX, 1 * SIZE, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + + xvfmul.s x3, VXAR, x1 + xvfmul.s x4, VXAR, x2 + addi.d I, I, -1 + xvstelm.w x3, XX, 0 * SIZE, 0 + xvstelm.w x4, XX, 1 * SIZE, 0 + 
add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 1 + xvstelm.w x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 2 + xvstelm.w x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 3 + xvstelm.w x4, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 4 + xvstelm.w x4, XX, 1 * SIZE, 4 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 5 + xvstelm.w x4, XX, 1 * SIZE, 5 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 6 + xvstelm.w x4, XX, 1 * SIZE, 6 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 7 + xvstelm.w x4, XX, 1 * SIZE, 7 +#endif + add.d XX, XX, INCX + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: //alpha_r != 0.0 && alpha_i != 0.0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + + xvfmul.d VX0, VXAI, x2 + xvfmsub.d x3, VXAR, x1, VX0 + xvfmul.d VX1, VXAI, x1 + xvfmadd.d x4, VXAR, x2, VX1 + addi.d I, I, -1 + xvstelm.d x3, XX, 0 * SIZE, 0 + xvstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 1 + xvstelm.d x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 2 + xvstelm.d x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 3 + xvstelm.d x4, XX, 1 * SIZE, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + + xvfmul.s VX0, VXAI, x2 + xvfmsub.s x3, VXAR, x1, VX0 + xvfmul.s VX1, VXAI, x1 + xvfmadd.s x4, VXAR, x2, VX1 + addi.d I, I, -1 + xvstelm.w x3, XX, 0 * SIZE, 0 + xvstelm.w x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 1 + xvstelm.w x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 2 + xvstelm.w x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 3 + xvstelm.w x4, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 4 + xvstelm.w x4, XX, 1 * SIZE, 4 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 5 + xvstelm.w x4, XX, 1 * SIZE, 5 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 6 + xvstelm.w x4, XX, 1 * SIZE, 6 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 7 + xvstelm.w x4, XX, 1 * SIZE, 7 +#endif + add.d XX, XX, INCX + blt $r0, I, .L224 + b .L997 + .align 3 + +.L997: +#ifdef DOUBLE + andi I, N, 3 +#else + andi I, N, 7 +#endif + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD 
a2, X, 1 * SIZE + addi.d I, I, -1 + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MSUB s1, ALPHAR, a1, s1 + MADD s2, ALPHAR, a2, s2 + ST s1, X, 0 * SIZE + ST s2, X, 1 * SIZE + add.d X, X, INCX + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/cscal_lsx.S b/kernel/loongarch64/cscal_lsx.S new file mode 100644 index 000000000..f442a754f --- /dev/null +++ b/kernel/loongarch64/cscal_lsx.S @@ -0,0 +1,571 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
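Reviewer's annotation, not part of the patch: the complex scal kernels being added here compute x[i] *= (alpha_r + i*alpha_i) on an interleaved (re, im) array, with fast paths chosen by the .L13/.L14 (.L23/.L24) tests on alpha_r and alpha_i. A minimal C sketch of that semantics; cscal_ref and FLOAT are hypothetical names, the stride is in complex elements and assumed positive, as the kernel itself requires before entering its loops.

#include <stddef.h>

typedef double FLOAT;                         /* assumption: DOUBLE build */

static void cscal_ref(ptrdiff_t n, FLOAT alpha_r, FLOAT alpha_i,
                      FLOAT *x, ptrdiff_t incx)
{
    for (ptrdiff_t i = 0; i < n; i++) {
        FLOAT *p = x + 2 * i * incx;
        if (alpha_r == 0.0 && alpha_i == 0.0) {   /* .L111 / .L221 path  */
            p[0] = 0.0;
            p[1] = 0.0;
        } else {
            FLOAT re = p[0], im = p[1];
            p[0] = alpha_r * re - alpha_i * im;   /* (x)vfmsub step      */
            p[1] = alpha_r * im + alpha_i * re;   /* (x)vfmadd step      */
        }
    }
}

The single-zero fast paths simply drop the terms that are identically zero; the all-zero path stores VXZ directly instead of multiplying, so the result is exactly zero even where x holds NaN or Inf.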
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define ALPHAR $f0 +#define ALPHAI $f1 +#define X $r7 +#define INCX $r8 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VXAR $vr23 +#define VXAI $vr19 +#define VXZ $vr12 +#define x1 $vr18 +#define x2 $vr17 +#define x3 $vr16 +#define x4 $vr15 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + MTG t1, ALPHAR +#ifdef DOUBLE + vreplgr2vr.d VXAR, t1 + movfr2gr.d t2, ALPHAI + vreplgr2vr.d VXAI, t2 +#else + vreplgr2vr.w VXAR, t1 + movfr2gr.s t2, ALPHAI + vreplgr2vr.w VXAI, t2 +#endif + vxor.v VXZ, VXZ, VXZ + srai.d I, N, 2 + bne INCX, TEMP, .L22 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, ALPHAR, a1 + CMPEQ $fcc1, ALPHAI, a1 + bceqz $fcc0, .L13 + b .L14 + .align 3 + +.L13: + bceqz $fcc1, .L114 //alpha_r != 0.0 && alpha_i != 0.0 + b .L113 //alpha_r != 0.0 && alpha_i == 0.0 + +.L14: + bceqz $fcc1, .L112 //alpha_r == 0.0 && alpha_i != 0.0 + b .L111 //alpha_r == 0.0 && alpha_i == 0.0 + .align 3 + +.L111: //alpha_r == 0.0 && alpha_i == 0.0 + vst VXZ, X, 0 * SIZE +#ifdef DOUBLE + vst VXZ, X, 2 * SIZE + vst VXZ, X, 4 * SIZE + vst VXZ, X, 6 * SIZE +#else + vst VXZ, X, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: //alpha_r == 0.0 && alpha_i != 0.0 + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXAI, x2 + vfsub.d x3, VXZ, x3 + vfmul.d x4, VXAI, x1 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, X, 0 * SIZE + vst VX3, X, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXAI, x2 + vfsub.d x3, VXZ, x3 + vfmul.d x4, VXAI, x1 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, X, 4 * SIZE + vst VX3, X, 6 * SIZE +#else + vld VX1, X, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, VXAI, x2 + vfsub.s x3, VXZ, x3 + vfmul.s x4, VXAI, x1 + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + vst VX2, X, 0 * SIZE + vst VX3, X, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: //alpha_r != 0.0 && alpha_i == 0.0 + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXAR, x1 + vfmul.d x4, VXAR, x2 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, X, 0 * SIZE + vst VX3, X, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXAR, x1 + vfmul.d x4, VXAR, x2 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, X, 4 * SIZE + vst VX3, X, 6 * SIZE +#else + vld VX1, X, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, VXAR, x1 + vfmul.s x4, VXAR, x2 + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + vst VX2, X, 0 * SIZE + vst VX3, X, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: //alpha_r != 0.0 && alpha_i != 0.0 + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 
2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d VX0, VXAI, x2 + vfmsub.d x3, VXAR, x1, VX0 + vfmul.d VX1, VXAI, x1 + vfmadd.d x4, VXAR, x2, VX1 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, X, 0 * SIZE + vst VX3, X, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d VX0, VXAI, x2 + vfmsub.d x3, VXAR, x1, VX0 + vfmul.d VX1, VXAI, x1 + vfmadd.d x4, VXAR, x2, VX1 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, X, 4 * SIZE + vst VX3, X, 6 * SIZE +#else + vld VX1, X, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s VX0, VXAI, x2 + vfmsub.s x3, VXAR, x1, VX0 + vfmul.s VX1, VXAI, x1 + vfmadd.s x4, VXAR, x2, VX1 + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + vst VX2, X, 0 * SIZE + vst VX3, X, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move XX, X + CMPEQ $fcc0, ALPHAR, a1 + CMPEQ $fcc1, ALPHAI, a1 + bceqz $fcc0, .L23 + b .L24 + .align 3 + +.L23: + bceqz $fcc1, .L224 //alpha_r != 0.0 && alpha_i != 0.0 + b .L223 //alpha_r != 0.0 && alpha_i == 0.0 + +.L24: + bceqz $fcc1, .L222 //alpha_r == 0.0 && alpha_i != 0.0 + b .L221 //alpha_r == 0.0 && alpha_i == 0.0 + .align 3 + +.L221: //alpha_r == 0.0 && alpha_i == 0.0 +#ifdef DOUBLE + vstelm.d VXZ, X, 0, 0 + vstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.d VXZ, X, 0, 0 + vstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.d VXZ, X, 0, 0 + vstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.d VXZ, X, 0, 0 + vstelm.d VXZ, X, 1 * SIZE, 0 +#else + vstelm.w VXZ, X, 0, 0 + vstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.w VXZ, X, 0, 0 + vstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.w VXZ, X, 0, 0 + vstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.w VXZ, X, 0, 0 + vstelm.w VXZ, X, 1 * SIZE, 0 +#endif + add.d X, X, INCX + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: //alpha_r == 0.0 && alpha_i != 0.0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vfmul.d x3, VXAI, x2 + vfsub.d x3, VXZ, x3 + vfmul.d x4, VXAI, x1 + vstelm.d x3, XX, 0 * SIZE, 0 + vstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d x3, XX, 0 * SIZE, 1 + vstelm.d x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vfmul.d x3, VXAI, x2 + vfsub.d x3, VXZ, x3 + vfmul.d x4, VXAI, x1 + addi.d I, I, -1 + vstelm.d x3, XX, 0 * SIZE, 0 + vstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d x3, XX, 0 * SIZE, 1 + vstelm.d x4, XX, 1 * SIZE, 1 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + + vfmul.s x3, VXAI, x2 + vfsub.s x3, VXZ, x3 + vfmul.s x4, VXAI, x1 + addi.d I, I, -1 + vstelm.w x3, XX, 0 * SIZE, 0 + vstelm.w 
x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 1 + vstelm.w x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 2 + vstelm.w x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 3 + vstelm.w x4, XX, 1 * SIZE, 3 +#endif + add.d XX, XX, INCX + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: //alpha_r != 0.0 && alpha_i == 0.0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vfmul.d x3, VXAR, x1 + vfmul.d x4, VXAR, x2 + vstelm.d x3, XX, 0 * SIZE, 0 + vstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d x3, XX, 0 * SIZE, 1 + vstelm.d x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vfmul.d x3, VXAR, x1 + vfmul.d x4, VXAR, x2 + addi.d I, I, -1 + vstelm.d x3, XX, 0 * SIZE, 0 + vstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d x3, XX, 0 * SIZE, 1 + vstelm.d x4, XX, 1 * SIZE, 1 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + + vfmul.s x3, VXAR, x1 + vfmul.s x4, VXAR, x2 + addi.d I, I, -1 + vstelm.w x3, XX, 0 * SIZE, 0 + vstelm.w x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 1 + vstelm.w x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 2 + vstelm.w x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 3 + vstelm.w x4, XX, 1 * SIZE, 3 +#endif + add.d XX, XX, INCX + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: //alpha_r != 0.0 && alpha_i != 0.0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vfmul.d VX0, VXAI, x2 + vfmsub.d x3, VXAR, x1, VX0 + vfmul.d VX1, VXAI, x1 + vfmadd.d x4, VXAR, x2, VX1 + vstelm.d x3, XX, 0 * SIZE, 0 + vstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d x3, XX, 0 * SIZE, 1 + vstelm.d x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vfmul.d VX0, VXAI, x2 + vfmsub.d x3, VXAR, x1, VX0 + vfmul.d VX1, VXAI, x1 + vfmadd.d x4, VXAR, x2, VX1 + addi.d I, I, -1 + vstelm.d x3, XX, 0 * SIZE, 0 + vstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d x3, XX, 0 * SIZE, 1 + vstelm.d x4, XX, 1 * SIZE, 1 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * 
SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + + vfmul.s VX0, VXAI, x2 + vfmsub.s x3, VXAR, x1, VX0 + vfmul.s VX1, VXAI, x1 + vfmadd.s x4, VXAR, x2, VX1 + addi.d I, I, -1 + vstelm.w x3, XX, 0 * SIZE, 0 + vstelm.w x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 1 + vstelm.w x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 2 + vstelm.w x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 3 + vstelm.w x4, XX, 1 * SIZE, 3 +#endif + add.d XX, XX, INCX + blt $r0, I, .L224 + b .L997 + .align 3 + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MSUB s1, ALPHAR, a1, s1 + MADD s2, ALPHAR, a2, s2 + ST s1, X, 0 * SIZE + ST s2, X, 1 * SIZE + add.d X, X, INCX + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/csum_lasx.S b/kernel/loongarch64/csum_lasx.S new file mode 100644 index 000000000..3e65f2c15 --- /dev/null +++ b/kernel/loongarch64/csum_lasx.S @@ -0,0 +1,274 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
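Reviewer's annotation, not part of the patch: a plain-C model of how the unit-stride DOUBLE path of the scal kernels (.L112-.L114) handles the interleaved layout, as I read the vpickev/vpickod and vilvl/vilvh usage: two (re, im) pairs are split into a real vector and an imaginary vector, the complex scale is applied, and the results are zipped back together before the store. v2d and scale_pair are illustrative names only.

typedef struct { double v[2]; } v2d;          /* stand-in for one LSX $vr */

static void scale_pair(v2d vx0, v2d vx1, double ar, double ai,
                       v2d *out0, v2d *out1)
{
    /* vpickev.d / vpickod.d: gather the real and the imaginary lanes */
    double re[2] = { vx0.v[0], vx1.v[0] };
    double im[2] = { vx0.v[1], vx1.v[1] };

    double new_re[2], new_im[2];
    for (int i = 0; i < 2; i++) {
        new_re[i] = ar * re[i] - ai * im[i];  /* vfmsub.d x3, VXAR, x1, VX0 */
        new_im[i] = ar * im[i] + ai * re[i];  /* vfmadd.d x4, VXAR, x2, VX1 */
    }

    /* vilvl.d / vilvh.d: interleave back into (re, im) pairs */
    out0->v[0] = new_re[0];  out0->v[1] = new_im[0];
    out1->v[0] = new_re[1];  out1->v[1] = new_im[1];
}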
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 +#define res1 $xr16 +#define res2 $xr17 + PROLOGUE + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 + xvld VX2, X, 8 * SIZE + xvld VX3, X, 12 * SIZE + xvfadd.d res2, VX2, VX3 + xvfadd.d res1, res1, res2 +#else + xvld VX0, X, 0 * SIZE + xvld VX1, X, 8 * SIZE + xvfadd.s res2, VX0, VX1 + xvfadd.s res1, res2, res1 +#endif + addi.d X, X, 16 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + ADD a1, a1, a2 + ADD s1, a1, s1 + + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + 
xvinsgr2vr.w VX0, t4, 7 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 0 + xvinsgr2vr.w VX1, t2, 1 + xvinsgr2vr.w VX1, t3, 2 + xvinsgr2vr.w VX1, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 4 + xvinsgr2vr.w VX1, t2, 5 + xvinsgr2vr.w VX1, t3, 6 + xvinsgr2vr.w VX1, t4, 7 + xvfadd.s res2, VX0, VX1 + xvfadd.s res1, res2, res1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + ADD a1, a1, a2 + ADD s1, a1, s1 + + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/csum_lsx.S b/kernel/loongarch64/csum_lsx.S new file mode 100644 index 000000000..8de8e27ca --- /dev/null +++ b/kernel/loongarch64/csum_lsx.S @@ -0,0 +1,266 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
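Reviewer's annotation, not part of the patch: the csum/zsum kernels accumulate the plain sum of all real and imaginary parts, with no absolute value (that is casum/zasum's job, handled by a different file). A minimal C sketch for reference; csum_ref and FLOAT are hypothetical names, the stride is in complex elements and assumed positive, matching the early-exit checks in the prologue.

#include <stddef.h>

typedef double FLOAT;                        /* assumption: DOUBLE build */

static FLOAT csum_ref(ptrdiff_t n, const FLOAT *x, ptrdiff_t incx)
{
    FLOAT s = 0.0;
    for (ptrdiff_t i = 0; i < n; i++) {
        const FLOAT *p = x + 2 * i * incx;
        s += p[0] + p[1];        /* ADD a1, a1, a2 ; ADD s1, a1, s1 */
    }
    return s;
}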
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 +#define res1 $vr16 +#define res2 $vr17 +#define res3 $vr18 + PROLOGUE + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + vld VX2, X, 4 * SIZE + vld VX3, X, 6 * SIZE + vfadd.d res2, VX2, VX3 + vfadd.d res1, res1, res2 + vld VX0, X, 8 * SIZE + vld VX1, X, 10 * SIZE + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + vld VX2, X, 12 * SIZE + vld VX3, X, 14 * SIZE + vfadd.d res2, VX2, VX3 + vfadd.d res1, res1, res2 +#else + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vfadd.s res2, VX0, VX1 + vld VX2, X, 8 * SIZE + vld VX3, X, 12 * SIZE + vfadd.s res3, VX2, VX3 + vfadd.s res2, res3, res2 + vfadd.s res1, res1, res2 +#endif + + addi.d I, I, -1 + addi.d X, X, 16 * SIZE + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + ADD a1, a1, a2 + ADD s1, a1, s1 + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + 
vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfadd.s res2, VX0, VX1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vfadd.s res3, VX2, VX3 + vfadd.s res2, res3, res2 + vfadd.s res1, res1, res2 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + ADD a1, a1, a2 + ADD s1, a1, s1 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/cswap_lasx.S b/kernel/loongarch64/cswap_lasx.S new file mode 100644 index 000000000..d53773d5a --- /dev/null +++ b/kernel/loongarch64/cswap_lasx.S @@ -0,0 +1,394 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
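Reviewer's annotation, not part of the patch: the .L12/.L22 epilogues of the two sum kernels perform a horizontal add of the vector accumulator into a single scalar, via vreplvei lane broadcasts in the LSX version just above and xvpickve lane extraction in the LASX version before it. A small C model of that reduction, shown for the 4 x double LASX accumulator; reduce_lanes is an illustrative name.

static double reduce_lanes(const double acc[4])
{
    double s = acc[0];
    for (int i = 1; i < 4; i++)   /* xvpickve.d VXi, res1, i ; xvfadd.d */
        s += acc[i];
    return s;
}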
+*******************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define YY $r6 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 + + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + xvst VX2, X, 0 * SIZE + xvst VX3, X, 4 * SIZE + xvst VX0, Y, 0 * SIZE + xvst VX1, Y, 4 * SIZE +#else + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvst VX2, X, 0 * SIZE + xvst VX0, Y, 0 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 +.L113: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + xvstelm.d VX0, Y, 0 * SIZE, 0 + ld.d t2, Y, 1 * SIZE + xvstelm.d VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + xvstelm.d VX0, Y, 0 * SIZE, 2 + ld.d t4, Y, 1 * SIZE + xvstelm.d VX0, Y, 1 * SIZE, 3 + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvst VX2, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + xvstelm.d VX1, Y, 0 * SIZE, 0 + ld.d t2, Y, 1 * SIZE + xvstelm.d VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + xvstelm.d VX1, Y, 0 * SIZE, 2 + ld.d t4, Y, 1 * SIZE + xvstelm.d VX1, Y, 1 * SIZE, 3 + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvst VX3, X, 4 * SIZE +#else + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + xvstelm.w VX0, Y, 0 * SIZE, 0 + ld.w t2, Y, 1 * SIZE + xvstelm.w VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + xvstelm.w VX0, Y, 0 * SIZE, 2 + ld.w t4, Y, 1 * SIZE + xvstelm.w VX0, Y, 1 * SIZE, 3 + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + ld.w t1, Y, 0 * SIZE + xvstelm.w VX0, Y, 0 * SIZE, 4 + ld.w t2, Y, 1 * SIZE + xvstelm.w VX0, Y, 1 * SIZE, 5 + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + xvstelm.w VX0, Y, 0 * SIZE, 6 + ld.w t4, Y, 1 * SIZE + xvstelm.w VX0, Y, 1 * SIZE, 7 + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvst VX2, X, 0 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L123: + LD 
a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + xvld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + xvstelm.d VX2, X, 0 * SIZE, 0 + ld.d t2, X, 1 * SIZE + xvstelm.d VX2, X, 1 * SIZE, 1 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + xvstelm.d VX2, X, 0 * SIZE, 2 + ld.d t4, X, 1 * SIZE + xvstelm.d VX2, X, 1 * SIZE, 3 + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvst VX0, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + xvstelm.d VX3, X, 0 * SIZE, 0 + ld.d t2, X, 1 * SIZE + xvstelm.d VX3, X, 1 * SIZE, 1 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + xvstelm.d VX3, X, 0 * SIZE, 2 + ld.d t4, X, 1 * SIZE + xvstelm.d VX3, X, 1 * SIZE, 3 + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvst VX1, Y, 4 * SIZE +#else + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + xvstelm.w VX2, X, 0 * SIZE, 0 + ld.w t2, X, 1 * SIZE + xvstelm.w VX2, X, 1 * SIZE, 1 + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + xvstelm.w VX2, X, 0 * SIZE, 2 + ld.w t4, X, 1 * SIZE + xvstelm.w VX2, X, 1 * SIZE, 3 + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + xvstelm.w VX2, X, 0 * SIZE, 4 + ld.w t2, X, 1 * SIZE + xvstelm.w VX2, X, 1 * SIZE, 5 + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + xvstelm.w VX2, X, 0 * SIZE, 6 + ld.w t4, X, 1 * SIZE + xvstelm.w VX2, X, 1 * SIZE, 7 + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvst VX0, Y, 0 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L213: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + move XX, X + +.L222: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + ST a1, Y, 0 * SIZE + LD b2, Y, 1 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + LD b3, Y, 0 * SIZE + ST a3, Y, 0 * SIZE + LD b4, Y, 1 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + ST b1, XX, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + ST b2, XX, 1 * SIZE + add.d XX, XX, INCX + LD a3, X, 0 * SIZE + ST b3, XX, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST b4, XX, 1 * SIZE + add.d XX, XX, INCX + + LD b1, Y, 0 * SIZE + ST a1, Y, 0 * SIZE + LD b2, Y, 1 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + LD b3, Y, 0 * SIZE + ST a3, Y, 0 * SIZE + LD b4, Y, 1 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + ST b1, XX, 0 * SIZE + ST b2, XX, 1 * SIZE + add.d XX, XX, INCX + ST b3, XX, 0 * SIZE + ST b4, XX, 1 * SIZE + + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L224: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + 
ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/cswap_lsx.S b/kernel/loongarch64/cswap_lsx.S new file mode 100644 index 000000000..62a869066 --- /dev/null +++ b/kernel/loongarch64/cswap_lsx.S @@ -0,0 +1,421 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define YY $r6 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 + + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + vst VX2, X, 0 * SIZE + vst VX3, X, 2 * SIZE + vst VX0, Y, 0 * SIZE + vst VX1, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX2, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + vst VX2, X, 4 * SIZE + vst VX3, X, 6 * SIZE + vst VX0, Y, 4 * SIZE + vst VX1, Y, 6 * SIZE +#else + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + vst VX2, X, 0 * SIZE + vst VX3, X, 4 * SIZE + vst VX0, Y, 0 * SIZE + vst VX1, Y, 4 * SIZE +#endif + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L113: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + vstelm.d VX0, Y, 0 * SIZE, 0 + ld.d t2, Y, 1 * SIZE + vstelm.d VX0, Y, 1 * SIZE, 1 + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vst VX2, X, 0 * SIZE + vld VX1, X, 2 * SIZE + ld.d t3, Y, 0 * SIZE + vstelm.d VX1, Y, 0 * SIZE, 0 + ld.d t4, Y, 1 * SIZE + vstelm.d VX1, Y, 1 * SIZE, 1 + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vst VX3, X, 2 * SIZE + vld VX0, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + vstelm.d VX0, Y, 0 * SIZE, 0 + ld.d t2, Y, 1 * SIZE + vstelm.d VX0, Y, 1 * SIZE, 1 + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vst VX2, X, 4 * SIZE + vld VX1, X, 6 * SIZE + ld.d t3, Y, 0 * SIZE + vstelm.d VX1, Y, 0 * SIZE, 0 + ld.d t4, Y, 1 * SIZE + vstelm.d VX1, Y, 1 * SIZE, 1 + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vst VX3, X, 6 * SIZE +#else + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + vstelm.w VX0, Y, 0 * SIZE, 0 + ld.w t2, Y, 1 * SIZE + vstelm.w VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + vstelm.w VX0, Y, 0 * SIZE, 2 + ld.w t4, Y, 1 * SIZE + vstelm.w VX0, Y, 1 * SIZE, 3 + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vst VX2, X, 0 * SIZE + + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + vstelm.w VX1, Y, 0 * SIZE, 0 + ld.w t2, Y, 1 * SIZE + vstelm.w VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + vstelm.w 
VX1, Y, 0 * SIZE, 2 + ld.w t4, Y, 1 * SIZE + vstelm.w VX1, Y, 1 * SIZE, 3 + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vst VX3, X, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L123: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + vld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + vstelm.d VX2, X, 0 * SIZE, 0 + ld.d t2, X, 1 * SIZE + vstelm.d VX2, X, 1 * SIZE, 1 + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vst VX0, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + ld.d t3, X, 0 * SIZE + vstelm.d VX3, X, 0 * SIZE, 0 + ld.d t4, X, 1 * SIZE + vstelm.d VX3, X, 1 * SIZE, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vst VX1, Y, 2 * SIZE + vld VX2, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + vstelm.d VX2, X, 0 * SIZE, 0 + ld.d t2, X, 1 * SIZE + vstelm.d VX2, X, 1 * SIZE, 1 + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vst VX0, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t3, X, 0 * SIZE + vstelm.d VX3, X, 0 * SIZE, 0 + ld.d t4, X, 1 * SIZE + vstelm.d VX3, X, 1 * SIZE, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vst VX1, Y, 6 * SIZE +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + vstelm.w VX2, X, 0 * SIZE, 0 + ld.w t2, X, 1 * SIZE + vstelm.w VX2, X, 1 * SIZE, 1 + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + vstelm.w VX2, X, 0 * SIZE, 2 + ld.w t4, X, 1 * SIZE + vstelm.w VX2, X, 1 * SIZE, 3 + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vst VX0, Y, 0 * SIZE + + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + vstelm.w VX3, X, 0 * SIZE, 0 + ld.w t2, X, 1 * SIZE + vstelm.w VX3, X, 1 * SIZE, 1 + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + vstelm.w VX3, X, 0 * SIZE, 2 + ld.w t4, X, 1 * SIZE + vstelm.w VX3, X, 1 * SIZE, 3 + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + vst VX1, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L213: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + move XX, X + +.L222: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + ST a1, Y, 0 * SIZE + LD b2, Y, 1 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + LD b3, Y, 0 * SIZE + ST a3, Y, 0 * SIZE + LD b4, Y, 1 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + ST b1, XX, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + ST b2, XX, 1 * SIZE + add.d XX, XX, INCX + LD a3, X, 0 * SIZE + ST b3, XX, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST b4, XX, 1 * SIZE + add.d XX, XX, INCX + + LD b1, Y, 0 * SIZE + ST a1, Y, 0 * SIZE + LD b2, Y, 1 * SIZE + ST 
a2, Y, 1 * SIZE + add.d Y, Y, INCY + LD b3, Y, 0 * SIZE + ST a3, Y, 0 * SIZE + LD b4, Y, 1 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + ST b1, XX, 0 * SIZE + ST b2, XX, 1 * SIZE + add.d XX, XX, INCX + ST b3, XX, 0 * SIZE + ST b4, XX, 1 * SIZE + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L224: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + + addi.d I, I, -1 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/damax_lasx.S b/kernel/loongarch64/damax_lasx.S deleted file mode 100644 index c44ce4995..000000000 --- a/kernel/loongarch64/damax_lasx.S +++ /dev/null @@ -1,183 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define J $r13 -#define t1 $r14 -#define t2 $r18 -#define t3 $r15 -#define t4 $r17 -#define TEMP $r16 -#define m0 $xr8 -#define x1 $xr9 -#define x2 $xr10 -#define x3 $xr11 -#define x4 $xr12 -#define x5 $xr13 -#define x6 $xr14 -#define x7 $xr15 -#define x8 $xr16 -#define VX0 $xr20 -#define VX1 $xr21 -#define VM0 $xr22 -#define VM1 $xr23 -#define VM2 $xr18 -#define VM3 $xr19 - - PROLOGUE - - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - xvld VM0, X, 0 - srai.d I, N, 3 - bge $r0, I, .L12 - .align 3 - -.L10: - xvld VX0, X, 0 * SIZE - xvld VX1, X, 4 * SIZE - addi.d I, I, -1 - xvfmaxa.d VM1, VX1, VX0 - addi.d X, X, 8 * SIZE - xvfmaxa.d VM0, VM0, VM1 - blt $r0, I, .L10 - .align 3 - -.L11: - xvpickve.d x1, VM0, 0 - xvpickve.d x2, VM0, 1 - xvpickve.d x3, VM0, 2 - xvpickve.d x4, VM0, 3 - xvfmaxa.d VM1, x1, x2 - xvfmaxa.d VM2, x3, x4 - xvfmaxa.d VM0, VM1, VM2 - .align 3 - -.L12: //INCX==1 and N<8 - andi I, N, 7 - li.d J, 4 - bge J, I, .L13 // 4> 2)) goto L_N3 */ + srai.d J, N, 2 /* J = bn >> 2 */ + andi N, N, 0x03 + vldrepl.d VALPHA, $sp, 104 /* When N < 4, VALPHA will not changed */ + beq ZERO, J, .L_N3 + +.L_J1: /* J-- && This loop include Condition 1 */ + +/************************* Condition 1 if((N >> 2) && (M >> 3)) START !!! 
************************* +* dgemm_core_16x4 */ + move C0, C + move A0, A + slli.d T0, LDC, 3 + add.d C1, C0, T0 + addi.d J, J, -1 /* J-- */ + add.d C2, C1, T0 + add.d C3, C2, T0 + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 3)) goto L_M8 */ + srai.d I, M, 3 /* I = bm >> 3 */ + beq ZERO, I, .L_M8 + +.L_I1: /* I-- */ +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + /* Calculate the first set of D0~D15, + * avoidig set 0 operation + * Load 8 * 64 from A0 + * U0 = {a1, a0} + * U1 = {a3, a2} + * U2 = {a5, a4} + * U3 = {a7, a6} + */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vldrepl.d U4, B0, 0x00 + preld 0, C0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + vfmul.d D1, U1, U4 + preld 0, C0, 0x20 + vfmul.d D2, U2, U4 + vfmul.d D3, U3, U4 + + vldrepl.d U5, B0, 0x08 + preld 0, C1, 0x00 + /* line 2 */ + vfmul.d D4, U0, U5 + vfmul.d D5, U1, U5 + preld 0, C1, 0x20 + vfmul.d D6, U2, U5 + vfmul.d D7, U3, U5 + + vldrepl.d U6, B0, 0x10 + preld 0, C2, 0x00 + /* line 3 */ + vfmul.d D8, U0, U6 + vfmul.d D9, U1, U6 + preld 0, C2, 0x20 + vfmul.d D10, U2, U6 + vfmul.d D11, U3, U6 + + vldrepl.d U7, B0, 0x18 + preld 0, C3, 0x00 + /* line 4 */ + vfmul.d D12, U0, U7 + vfmul.d D13, U1, U7 + preld 0, C3, 0x20 + vfmul.d D14, U2, U7 + vfmul.d D15, U3, U7 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_L7 */ + beq ZERO,TL, .L_L7 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + vld U10, A0, 0x20 + vld U11, A0, 0x30 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + vldrepl.d U13, B0, 0x08 + vldrepl.d U14, B0, 0x10 + vldrepl.d U15, B0, 0x18 + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + beq ZERO, TL, .L_TL1_END +.L_TL1: /* TL-- */ + KERNEL8x8x4 + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_TL1 + +.L_TL1_END: + KERNEL8x8x4_END + + /* Maybe we need calculate the last + * 7 sets of D0~D15? 
+ */ +.L_L7: + /* if (!(L & 7)) goto L_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_L0 + +.L_L71: + /* Load 16 * 64 from A0 */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + /* Cumulative D0~D15 */ + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + vldrepl.d U5, B0, 0x08 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + vfmadd.d D6, U2, U5, D6 + vfmadd.d D7, U3, U5, D7 + + vldrepl.d U6, B0, 0x10 + vfmadd.d D8, U0, U6, D8 + vfmadd.d D9, U1, U6, D9 + vfmadd.d D10, U2, U6, D10 + vfmadd.d D11, U3, U6, D11 + + vldrepl.d U7, B0, 0x18 + vfmadd.d D12, U0, U7, D12 + vfmadd.d D13, U1, U7, D13 + vfmadd.d D14, U2, U7, D14 + vfmadd.d D15, U3, U7, D15 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_L71 + +.L_L0: + vldrepl.d VALPHA, $sp, 104 +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + vfmul.d D2, D2, VALPHA + vfmul.d D3, D3, VALPHA + vfmul.d D4, D4, VALPHA + vfmul.d D5, D5, VALPHA + vfmul.d D6, D6, VALPHA + vfmul.d D7, D7, VALPHA + vfmul.d D8, D8, VALPHA + vfmul.d D9, D9, VALPHA + vfmul.d D10, D10, VALPHA + vfmul.d D11, D11, VALPHA + vfmul.d D12, D12, VALPHA + vfmul.d D13, D13, VALPHA + vfmul.d D14, D14, VALPHA + vfmul.d D15, D15, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C0, 0x10 + vld U2, C0, 0x20 + vld U3, C0, 0x30 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + vfmadd.d D1, D1, VALPHA, U1 + vfmadd.d D2, D2, VALPHA, U2 + vfmadd.d D3, D3, VALPHA, U3 + + /* Load C1 */ + vld U4, C1, 0x00 + vld U5, C1, 0x10 + vld U6, C1, 0x20 + vld U7, C1, 0x30 + vfmadd.d D4, D4, VALPHA, U4 + vfmadd.d D5, D5, VALPHA, U5 + vfmadd.d D6, D6, VALPHA, U6 + vfmadd.d D7, D7, VALPHA, U7 + + /* Load C2 */ + vld U8, C2, 0x00 + vld U9, C2, 0x10 + vld U10, C2, 0x20 + vld U11, C2, 0x30 + vfmadd.d D8, D8, VALPHA, U8 + vfmadd.d D9, D9, VALPHA, U9 + vfmadd.d D10, D10, VALPHA, U10 + vfmadd.d D11, D11, VALPHA, U11 + + /* Load C3 */ + vld U0, C3, 0x00 + vld U1, C3, 0x10 + vld U2, C3, 0x20 + vld U3, C3, 0x30 + vfmadd.d D12, D12, VALPHA, U0 + vfmadd.d D13, D13, VALPHA, U1 + vfmadd.d D14, D14, VALPHA, U2 + vfmadd.d D15, D15, VALPHA, U3 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + vst D0, C0, 0x00 + vst D1, C0, 0x10 + vst D2, C0, 0x20 + vst D3, C0, 0x30 + /* Store C1 */ + vst D4, C1, 0x00 + vst D5, C1, 0x10 + vst D6, C1, 0x20 + vst D7, C1, 0x30 + /* Store C2 */ + vst D8, C2, 0x00 + vst D9, C2, 0x10 + vst D10, C2, 0x20 + vst D11, C2, 0x30 + /* Store C3 */ + vst D12, C3, 0x00 + vst D13, C3, 0x10 + vst D14, C3, 0x20 + vst D15, C3, 0x30 + + /* Add stride for C */ + addi.d C0, C0, 0x40 + addi.d C1, C1, 0x40 + addi.d C2, C2, 0x40 + addi.d C3, C3, 0x40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -8 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x06 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) + + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_I1 + +.L_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 7 + beq ZERO,I, .L_M0 + + andi I, M, 4 + beq ZERO,I, .L_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d 
A0, A0, T0 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 4 * 64 from A0 */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + vfmul.d D1, U1, U4 + + vldrepl.d U5, B0, 0x08 + /* line 2 */ + vfmul.d D4, U0, U5 + vfmul.d D5, U1, U5 + + vldrepl.d U6, B0, 0x10 + /* line 3 */ + vfmul.d D8, U0, U6 + vfmul.d D9, U1, U6 + + vldrepl.d U7, B0, 0x18 + /* line 4 */ + vfmul.d D12, U0, U7 + vfmul.d D13, U1, U7 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M4_L7 */ + beq ZERO,TL, .L_M4_L7 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + vldrepl.d U13, B0, 0x08 + vldrepl.d U14, B0, 0x10 + vldrepl.d U15, B0, 0x18 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + beq ZERO, TL, .L_M4_TL1_END + +.L_M4_TL1: /* TL-- */ + KERNEL8x4x4 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M4_TL1 + +.L_M4_TL1_END: + KERNEL8x4x4_END + +.L_M4_L7: + /* if (!(L & 7)) goto L_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M4_L0 + +.L_M4_L71: + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vldrepl.d U5, B0, 0x08 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + vldrepl.d U6, B0, 0x10 + vfmadd.d D8, U0, U6, D8 + vfmadd.d D9, U1, U6, D9 + + vldrepl.d U7, B0, 0x18 + vfmadd.d D12, U0, U7, D12 + vfmadd.d D13, U1, U7, D13 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M4_L71 + +.L_M4_L0: + vldrepl.d VALPHA, $sp, 104 +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + vfmul.d D4, D4, VALPHA + vfmul.d D5, D5, VALPHA + vfmul.d D8, D8, VALPHA + vfmul.d D9, D9, VALPHA + vfmul.d D12, D12, VALPHA + vfmul.d D13, D13, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C0, 0x10 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + vfmadd.d D1, D1, VALPHA, U1 + + /* Load C1 */ + vld U2, C1, 0x00 + vld U3, C1, 0x10 + vfmadd.d D4, D4, VALPHA, U2 + vfmadd.d D5, D5, VALPHA, U3 + + /* Load C2 */ + vld U4, C2, 0x00 + vld U5, C2, 0x10 + vfmadd.d D8, D8, VALPHA, U4 + vfmadd.d D9, D9, VALPHA, U5 + + /* Load C3 */ + vld U6, C3, 0x00 + vld U7, C3, 0x10 + vfmadd.d D12, D12, VALPHA, U6 + vfmadd.d D13, D13, VALPHA, U7 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + vst D0, C0, 0x00 + vst D1, C0, 0x10 + /* Store C1 */ + vst D4, C1, 0x00 + vst D5, C1, 0x10 + /* Store C2 */ + vst D8, C2, 0x00 + vst D9, C2, 0x10 + /* Store C3 */ + vst D12, C3, 0x00 + vst D13, C3, 0x10 + + /* Add stride for C */ + addi.d C0, C0, 0x20 + addi.d C1, C1, 0x20 + addi.d C2, C2, 0x20 + addi.d C3, C3, 0x20 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -4 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N >> 2 ) && (M & 4) ) End************/ 
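
/* Editor's note (not part of the patch): every M-tail block of this 8x4 kernel
   (M & 4, M & 2, M & 1, and the main M >> 3 loop above) follows the same pattern:
   accumulate an mr x 4 tile of A*B over K with vfmul.d/vfmadd.d, then either scale
   by alpha (TRMMKERNEL) or scale and add the previously loaded C values (GEMM).
   The C sketch below is illustrative only; the names mr, pa, pb, ldc and trmm are
   placeholders, assuming the usual packed-panel layout and column-major C with
   leading dimension ldc, which is what the C0..C3 column pointers implement. */

#include <stddef.h>

/* Reference sketch of one N=4 column block for an M-tail of mr rows (8/4/2/1). */
static void dgemm_block_ref(size_t mr, size_t K, double alpha,
                            const double *pa,   /* packed A: K groups of mr values */
                            const double *pb,   /* packed B: K groups of 4 values  */
                            double *C, size_t ldc, int trmm)
{
    double acc[8][4] = {{0.0}};                 /* mirrors the D0..D15 registers   */

    for (size_t k = 0; k < K; k++)              /* the .L_TL1 / .L_*_L71 loops     */
        for (size_t i = 0; i < mr; i++)
            for (size_t j = 0; j < 4; j++)
                acc[i][j] += pa[k * mr + i] * pb[k * 4 + j];

    for (size_t i = 0; i < mr; i++)
        for (size_t j = 0; j < 4; j++) {
            if (trmm)                           /* TRMMKERNEL: C is overwritten    */
                C[j * ldc + i] = alpha * acc[i][j];
            else                                /* GEMM: vfmadd.d D, D, VALPHA, U  */
                C[j * ldc + i] = alpha * acc[i][j] + C[j * ldc + i];
        }
}
/* End of editor's note. */
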
+ +.L_M2: + andi I, M, 2 + beq ZERO,I, .L_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + vldrepl.d U0, A0, 0x00 + vldrepl.d U1, A0, 0x08 + + vld U4, B0, 0x00 + vld U5, B0, 0x10 + + vfmul.d D0, U0, U4 + vfmul.d D1, U0, U5 + vfmul.d D2, U1, U4 + vfmul.d D3, U1, U5 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M2_L7 */ + beq ZERO,TL, .L_M2_L7 + + vldrepl.d U8, A0, 0x00 + vldrepl.d U9, A0, 0x08 + + addi.d TL, TL, -1 + + vld U12, B0, 0x00 + vld U13, B0, 0x10 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + beq ZERO, TL, .L_M2_TL1_END +.L_M2_TL1: /* TL-- */ + KERNEL8x2x4 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M2_TL1 +.L_M2_TL1_END: + KERNEL8x2x4_END + +.L_M2_L7: + /* if (!(L & 7)) goto L_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M2_L0 + +.L_M2_L71: + vldrepl.d U0, A0, 0x00 + vldrepl.d U1, A0, 0x08 + + vld U4, B0, 0x00 + vld U5, B0, 0x10 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U0, U5, D1 + vfmadd.d D2, U1, U4, D2 + vfmadd.d D3, U1, U5, D3 + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M2_L71 + +.L_M2_L0: + vldrepl.d VALPHA, $sp, 104 +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + vfmul.d D2, D2, VALPHA + vfmul.d D3, D3, VALPHA + + vstelm.d D0, C0, 0x00, 0x00 + vstelm.d D0, C1, 0x00, 0x01 + vstelm.d D1, C2, 0x00, 0x00 + vstelm.d D1, C3, 0x00, 0x01 + vstelm.d D2, C0, 0x08, 0x00 + vstelm.d D2, C1, 0x08, 0x01 + vstelm.d D3, C2, 0x08, 0x00 + vstelm.d D3, C3, 0x08, 0x01 +#else + /* Load C0 */ + vld U0, C0, 0x00 + /* Load C1 */ + vld U1, C1, 0x00 + /* Load C2 */ + vld U2, C2, 0x00 + /* Load C3 */ + vld U3, C3, 0x00 + + vilvl.d D4, D2, D0 //C0 + vilvh.d D5, D2, D0 //C1 + vilvl.d D6, D3, D1 //C2 + vilvh.d D7, D3, D1 //C3 + + vfmadd.d D0, D4, VALPHA, U0 + vfmadd.d D2, D5, VALPHA, U1 + vfmadd.d D1, D6, VALPHA, U2 + vfmadd.d D3, D7, VALPHA, U3 + + vst D0, C0, 0x00 + vst D2, C1, 0x00 + vst D1, C2, 0x00 + vst D3, C3, 0x00 +#endif // #if defined(TRMMKERNEL) + + /* Add stride for C */ + addi.d C0, C0, 0x10 + addi.d C1, C1, 0x10 + addi.d C2, C2, 0x10 + addi.d C3, C3, 0x10 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -2 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N >> 2 ) && (M & 2) ) End************/ + +.L_M1: + andi I, M, 1 + beq ZERO,I, .L_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && 
!defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + vldrepl.d U0, A0, 0x00 + vld U4, B0, 0x00 + vld U5, B0, 0x10 + vfmul.d D0, U0, U4 + vfmul.d D1, U0, U5 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M1_L7 */ + beq ZERO,TL, .L_M1_L7 + + vldrepl.d U8, A0, 0x00 + + addi.d TL, TL, -1 + vld U12, B0, 0x00 + vld U13, B0, 0x10 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + beq ZERO, TL, .L_M1_TL1_END + +.L_M1_TL1: /* TL-- */ + KERNEL8x1x4 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M1_TL1 +.L_M1_TL1_END: + KERNEL8x1x4_END + +.L_M1_L7: + /* if (!(L & 7)) goto L_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M1_L0 + +.L_M1_L71: + vldrepl.d U0, A0, 0x00 + vld U4, B0, 0x00 + vld U5, B0, 0x10 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U0, U5, D1 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M1_L71 + +.L_M1_L0: + vldrepl.d VALPHA, $sp, 104 +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + + vstelm.d D0, C0, 0x00, 0x00 + vstelm.d D0, C1, 0x00, 0x01 + vstelm.d D1, C2, 0x00, 0x00 + vstelm.d D1, C3, 0x00, 0x01 +#else + /* Load C0 */ + vldrepl.d U0, C0, 0x00 + vldrepl.d U1, C1, 0x00 + vilvl.d D4, U1, U0 + vfmadd.d D6, D0, VALPHA, D4 + + vldrepl.d U2, C2, 0x00 + vldrepl.d U3, C3, 0x00 + vilvl.d D5, U3, U2 + vfmadd.d D7, D1, VALPHA, D5 + + vstelm.d D6, C0, 0x00, 0x00 + vstelm.d D6, C1, 0x00, 0x01 + vstelm.d D7, C2, 0x00, 0x00 + vstelm.d D7, C3, 0x00, 0x01 +#endif // #if defined(TRMMKERNEL) + + /* Add stride for C */ + addi.d C0, C0, 0x08 + addi.d C1, C1, 0x08 + addi.d C2, C2, 0x08 + addi.d C3, C3, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -1 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x03 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N >> 2 ) && (M & 1) ) End************/ + +.L_M0: + /* Add stride for B and C + * B += (K * 32) + * C += (LDC * 32) + */ + /* since the array type is double, + * so we must mul 32 + */ + slli.d T0, K, 5 + slli.d T1, LDC, 5 + add.d B, B, T0 + add.d C, C, T1 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d OFF, OFF, 0x04 +#endif + + blt ZERO, J, .L_J1 + +//////////////// go back to L_J1 ///////////////// +///////////////////////////////////////////////// +/************************ Condition 1 if((N >> 2) && (M >> 3)) END !!! ************************/ + + vldrepl.d VALPHA, $sp, 104 + +.L_N3: + andi J, N, 2 + beq ZERO, J, .L_N1 + +/************************* Condition 2 if((N & 2) && (M >> 3)) START !!! 
************************* +* dgemm_core_16x2 */ + + move C0, C + move A0, A + slli.d T0, LDC, 3 + add.d C1, C0, T0 + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 3)) goto L_N3_M8 */ + srai.d I, M, 3 /* I = bm >> 3 */ + beq ZERO, I, .L_N3_M8 + +.L_N3_I1: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 8 * 64 from A0 + * U0 = {a1, a0} + * U1 = {a3, a2} + * U2 = {a5, a4} + * U3 = {a7, a6} + */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + vfmul.d D1, U1, U4 + vfmul.d D2, U2, U4 + vfmul.d D3, U3, U4 + + vldrepl.d U5, B0, 0x08 + /* line 2 */ + vfmul.d D4, U0, U5 + vfmul.d D5, U1, U5 + vfmul.d D6, U2, U5 + vfmul.d D7, U3, U5 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_L7 */ + beq ZERO,TL, .L_N3_L7 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + vld U10, A0, 0x20 + vld U11, A0, 0x30 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + vldrepl.d U13, B0, 0x08 + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + beq ZERO, TL, .L_N3_TL1_END + +.L_N3_TL1: /* TL-- */ + KERNEL8x8x2 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_TL1 +.L_N3_TL1_END: + KERNEL8x8x2_END + +.L_N3_L7: + /* if (!(L & 7)) goto L_N3_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_L0 + +.L_N3_L71: + /* Load 16 * 64 from A0 */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + vldrepl.d U5, B0, 0x08 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + vfmadd.d D6, U2, U5, D6 + vfmadd.d D7, U3, U5, D7 + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_L71 + +.L_N3_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + vfmul.d D2, D2, VALPHA + vfmul.d D3, D3, VALPHA + vfmul.d D4, D4, VALPHA + vfmul.d D5, D5, VALPHA + vfmul.d D6, D6, VALPHA + vfmul.d D7, D7, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C0, 0x10 + vld U2, C0, 0x20 + vld U3, C0, 0x30 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + vfmadd.d D1, D1, VALPHA, U1 + vfmadd.d D2, D2, VALPHA, U2 + vfmadd.d D3, D3, VALPHA, U3 + + /* Load C1 */ + vld U4, C1, 0x00 + vld U5, C1, 0x10 + vld U6, C1, 0x20 + vld U7, C1, 0x30 + vfmadd.d D4, D4, VALPHA, U4 + vfmadd.d D5, D5, VALPHA, U5 + vfmadd.d D6, D6, VALPHA, U6 + vfmadd.d D7, D7, VALPHA, U7 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + vst D0, C0, 0x00 + vst D1, C0, 0x10 + vst D2, C0, 0x20 + vst D3, C0, 0x30 + /* Store C1 */ + vst D4, C1, 0x00 + vst D5, C1, 0x10 + vst D6, C1, 0x20 + vst D7, C1, 0x30 + + /* Add stride for C */ + addi.d C0, C0, 0x40 + addi.d C1, C1, 0x40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef 
LEFT + addi.d L, L, -8 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x06 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x8 +#endif +#endif // #if defined(TRMMKERNEL) + + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_N3_I1 + +.L_N3_M8: + /* We have done M & 8, considering M=4/2/1 */ + andi I, M, 7 + beq ZERO,I, .L_N3_M0 + + andi I, M, 4 + beq ZERO,I, .L_N3_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 4 * 64 from A0 */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + vfmul.d D1, U1, U4 + + vldrepl.d U5, B0, 0x08 + /* line 2 */ + vfmul.d D4, U0, U5 + vfmul.d D5, U1, U5 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M4_L7 */ + beq ZERO,TL, .L_N3_M4_L7 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + vldrepl.d U13, B0, 0x08 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + beq ZERO, TL, .L_N3_M4_TL1_END + +.L_N3_M4_TL1: /* TL-- */ + KERNEL8x4x2 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M4_TL1 +.L_N3_M4_TL1_END: + KERNEL8x4x2_END + +.L_N3_M4_L7: + /* if (!(L & 7)) goto L_N3_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M4_L0 + +.L_N3_M4_L71: + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vldrepl.d U5, B0, 0x08 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M4_L71 + +.L_N3_M4_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + vfmul.d D4, D4, VALPHA + vfmul.d D5, D5, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C0, 0x10 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + vfmadd.d D1, D1, VALPHA, U1 + + /* Load C1 */ + vld U2, C1, 0x00 + vld U3, C1, 0x10 + vfmadd.d D4, D4, VALPHA, U2 + vfmadd.d D5, D5, VALPHA, U3 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + vst D0, C0, 0x00 + vst D1, C0, 0x10 + /* Store C1 */ + vst D4, C1, 0x00 + vst D5, C1, 0x10 + + /* Add stride for C */ + addi.d C0, C0, 0x20 + addi.d C1, C1, 0x20 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -4 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2 ) && (M & 4) ) End************/ + +.L_N3_M2: + andi I, M, 2 + beq ZERO,I, .L_N3_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d 
L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + vld U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + + vldrepl.d U4, B0, 0x08 + /* line 2 */ + vfmul.d D4, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M2_L7 */ + beq ZERO,TL, .L_N3_M2_L7 + + vld U8, A0, 0x00 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + vldrepl.d U13, B0, 0x08 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + beq ZERO, TL, .L_N3_M2_TL1_END + +.L_N3_M2_TL1: /* TL-- */ + KERNEL8x2x2 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M2_TL1 +.L_N3_M2_TL1_END: + KERNEL8x2x2_END + +.L_N3_M2_L7: + /* if (!(L & 7)) goto L_N3_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M2_L0 + +.L_N3_M2_L71: + vld U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + vldrepl.d U5, B0, 0x08 + vfmadd.d D0, U0, U4, D0 + + vfmadd.d D4, U0, U5, D4 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M2_L71 + +.L_N3_M2_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D4, D4, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + vld U1, C1, 0x00 + vfmadd.d D4, D4, VALPHA, U1 +#endif // #if defined(TRMMKERNEL) + + vst D0, C0, 0x00 + vst D4, C1, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x10 + addi.d C1, C1, 0x10 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -2 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2 ) && (M & 2) ) End************/ + +.L_N3_M1: + andi I, M, 1 + beq ZERO,I, .L_N3_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 1 * 64 from A0 */ + vldrepl.d U0, A0, 0x00 + + vld U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M1_L7 */ + beq ZERO,TL, .L_N3_M1_L7 + + vldrepl.d U8, A0, 0x00 + + addi.d TL, TL, -1 + + vld U12, B0, 0x00 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + beq ZERO, TL, .L_N3_M1_TL1_END + +.L_N3_M1_TL1: /* TL-- */ + KERNEL8x1x2 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M1_TL1 +.L_N3_M1_TL1_END: + KERNEL8x1x2_END + +.L_N3_M1_L7: + /* if (!(L & 7)) goto L_N3_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M1_L0 + +.L_N3_M1_L71: + vldrepl.d U0, A0, 0x00 + + vld U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d 
B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M1_L71 + +.L_N3_M1_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C1, 0x00 + vilvl.d U2, U1, U0 + vfmadd.d D0, D0, VALPHA, U2 +#endif // #if defined(TRMMKERNEL) + + vstelm.d D0, C0, 0x00, 0x00 + vstelm.d D0, C1, 0x00, 0x01 + + /* Add stride for C */ + addi.d C0, C0, 0x08 + addi.d C1, C1, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -1 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x03 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2 ) && (M & 1) ) End************/ + +.L_N3_M0: + /* Add stride for B and C + * B += (K * 16) + * C += (LDC * 16) + */ + /* since the array type is double, + * so we must mul 16 + */ + slli.d T0, K, 4 + slli.d T1, LDC, 4 + add.d B, B, T0 + add.d C, C, T1 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d OFF, OFF, 0x02 +#endif + + /* We must reinit I */ + srai.d I, M, 4 /* I = bm >> 4 */ + +/************************* Condition 2 if((N & 2) && (M >> 3)) End !!! ************************* +* dgemm_core_16x2 */ + +.L_N1: + andi J, N, 1 + beq ZERO, J, .L_N0 + +/************************* Condition 3 if((N & 1) && (M >> 3)) START !!! ************************* +* dgemm_core_16x1 */ + + move C0, C + move A0, A + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 3)) goto L_N1_M8 */ + srai.d I, M, 3 /* I = bm >> 3 */ + beq ZERO, I, .L_N1_M8 + +.L_N1_I1: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 8 * 64 from A0 + * U0 = {a3, a2} + * U1 = {a1, a0} + * U2 = {a5, a4} + * U3 = {a7, a6} + */ + + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + vfmul.d D1, U1, U4 + vfmul.d D2, U2, U4 + vfmul.d D3, U3, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_L7 */ + beq ZERO,TL, .L_N1_L7 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + vld U10, A0, 0x20 + vld U11, A0, 0x30 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + beq ZERO, TL, .L_N1_TL1_END +.L_N1_TL1: /* TL-- */ + KERNEL8x8x1 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_TL1 +.L_N1_TL1_END: + KERNEL8x8x1_END + +.L_N1_L7: + /* if (!(L & 7)) goto L_N1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_L0 + +.L_N1_L71: + /* Load 16 * 64 from A0 */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_L71 + +.L_N1_L0: +#if 
defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + vfmul.d D2, D2, VALPHA + vfmul.d D3, D3, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C0, 0x10 + vld U2, C0, 0x20 + vld U3, C0, 0x30 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + vfmadd.d D1, D1, VALPHA, U1 + vfmadd.d D2, D2, VALPHA, U2 + vfmadd.d D3, D3, VALPHA, U3 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + vst D0, C0, 0x00 + vst D1, C0, 0x10 + vst D2, C0, 0x20 + vst D3, C0, 0x30 + + /* Add stride for C */ + addi.d C0, C0, 0x40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -8 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x06 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x8 +#endif +#endif // #if defined(TRMMKERNEL) + + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_N1_I1 + +.L_N1_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 7 + beq ZERO,I, .L_N1_M0 + + andi I, M, 4 + beq ZERO,I, .L_N1_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 4 * 64 from A0 */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + vfmul.d D1, U1, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M4_L7 */ + beq ZERO,TL, .L_N1_M4_L7 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + beq ZERO, TL, .L_N1_M4_TL1_END + +.L_N1_M4_TL1: /* TL-- */ + KERNEL8x4x1 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M4_TL1 +.L_N1_M4_TL1_END: + KERNEL8x4x1_END + +.L_N1_M4_L7: + /* if (!(L & 7)) goto L_N1_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M4_L0 + +.L_N1_M4_L71: + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M4_L71 + +.L_N1_M4_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C0, 0x10 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + vfmadd.d D1, D1, VALPHA, U1 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + vst D0, C0, 0x00 + vst D1, C0, 0x10 + + /* Add stride for C */ + addi.d C0, C0, 0x20 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -4 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 1) && (M & 4) ) End************/ + +.L_N1_M2: + andi I, M, 2 + beq ZERO,I, .L_N1_M1 + +#if defined(TRMMKERNEL) +#if 
(defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + vld U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M2_L7 */ + beq ZERO,TL, .L_N1_M2_L7 + + vld U8, A0, 0x00 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + beq ZERO, TL, .L_N1_M2_TL1_END + +.L_N1_M2_TL1: /* TL-- */ + KERNEL8x2x1 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M2_TL1 +.L_N1_M2_TL1_END: + KERNEL8x2x1_END + +.L_N1_M2_L7: + /* if (!(L & 7)) goto L_N1_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M2_L0 + +.L_N1_M2_L71: + vld U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M2_L71 + +.L_N1_M2_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ +#endif // #if defined(TRMMKERNEL) + + vstelm.d D0, C0, 0x00, 0x00 + vstelm.d D0, C0, 0x08, 0x01 + + /* Add stride for C */ + addi.d C0, C0, 0x10 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -2 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 1 ) && (M & 2) ) End************/ + +.L_N1_M1: + andi I, M, 1 + beq ZERO,I, .L_N1_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 1 * 64 from A0 */ + vldrepl.d U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M1_L7 */ + beq ZERO,TL, .L_N1_M1_L7 + + vldrepl.d U8, A0, 0x00 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + beq ZERO, TL, .L_N1_M1_TL1_END + +.L_N1_M1_TL1: /* TL-- */ + KERNEL8x1x1 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M1_TL1 +.L_N1_M1_TL1_END: + KERNEL8x1x1_END + +.L_N1_M1_L7: + /* if (!(L & 7)) goto L_N1_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M1_L0 + +.L_N1_M1_L71: + vldrepl.d U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + + /* 
Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M1_L71 + +.L_N1_M1_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA +#else + /* Load C0 */ + vldrepl.d U0, C0, 0x00 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ +#endif // #if defined(TRMMKERNEL) + + vstelm.d D0, C0, 0x00, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -1 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x03 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 1 ) && (M & 1) ) End************/ + +.L_N1_M0: + +/************************* Condition 3 if((N & 1) && (M >> 3)) End !!! ************************* +* dgemm_core_16x1 */ + +.L_N0: + /* Restore regs */ + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LD $f24, $sp, 40 + LD $f25, $sp, 48 + LD $f26, $sp, 56 + LD $f27, $sp, 64 + LD $f28, $sp, 72 + LD $f29, $sp, 80 + LD $f30, $sp, 88 + LD $f31, $sp, 96 + addi.d $sp, $sp, 112 + + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/dgemm_ncopy_4_lsx.S b/kernel/loongarch64/dgemm_ncopy_4_lsx.S new file mode 100644 index 000000000..048a49af6 --- /dev/null +++ b/kernel/loongarch64/dgemm_ncopy_4_lsx.S @@ -0,0 +1,185 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r21 +#define TL $r7 +#define T0 $r6 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define D0 $vr8 +#define D1 $vr9 +#define D2 $vr10 +#define D3 $vr11 +#define D4 $vr12 +#define D5 $vr13 +#define D6 $vr14 +#define D7 $vr15 + + PROLOGUE + + move TD, DST + move TS, SRC + slli.d TL, LDA, 0x03 + slli.d T0, TL, 0x01 + srai.d J, N, 0x02 + beq J, ZERO, .L_N2 +.L_J1: /* J-- */ + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x02 + add.d S3, S2, TL + add.d S4, S2, T0 + add.d TS, S3, T0 + addi.d J, J, -1 + beq I, ZERO, .L_I3 +.L_I1: /* I-- */ + GLD v, , U0, S1, 0x00, U1, S2, 0x00, U2, S3, 0x00, U3, S4, 0x00 + GINTERLACE v, d, D0, D2, U1, U0 + GINTERLACE v, d, D1, D3, U3, U2 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30 + addi.d TD, TD, 0x40 + + GLD v, , U0, S1, 0x10, U1, S2, 0x10, U2, S3, 0x10, U3, S4, 0x10 + GINTERLACE v, d, D0, D2, U1, U0 + GINTERLACE v, d, D1, D3, U3, U2 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 +.L_I3: + andi I, M, 0x03 + beq I, ZERO, .L_I0 +.L_II1: + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + fst.d F2, TD, 0x10 + addi.d S3, S3, 0x08 + fst.d F3, TD, 0x18 + addi.d S4, S4, 0x08 + + addi.d TD, TD, 0x20 + addi.d I, I, -1 + blt ZERO, I, .L_II1 +.L_I0: + blt ZERO, J, .L_J1 +.L_N2: + andi J, N, 0x02 + beq ZERO, J, .L_N1 + + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x01 + add.d TS, S2, TL + beq I, ZERO, .L_2I3 +.L_2I1: /* I-- */ + GLD v, , U0, S1, 0x00, U1, S2, 0x00 + GINTERLACE v, d, D0, D1, U1, U0 + GST v, , D0, TD, 0x00, D1, TD, 0x10 + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d TD, TD, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_2I1 +.L_2I3: + andi I, M, 0x01 + beq ZERO, I, .L_N1 +.L_2II1: /* I-- */ + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fst.d F0, TD, 0x00 + addi.d I, I, -1 + fst.d F1, TD, 0x08 + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d TD, TD, 0x10 + blt ZERO, I, .L_2II1 +.L_N1: + move S1, TS + beq ZERO, M, .L_N0 +.L_M1: + fld.d F0, S1, 0x00 + addi.d S1, S1, 0x08 + fst.d F0, TD, 0x00 + addi.d TD, TD, 0x08 + addi.d M, M, -1 + blt ZERO, M, .L_M1 +.L_N0: + jirl $r0, $r1, 0x00 + EPILOGUE diff --git a/kernel/loongarch64/dgemm_ncopy_8_lsx.S b/kernel/loongarch64/dgemm_ncopy_8_lsx.S new file mode 100644 index 000000000..30bebe8df --- /dev/null +++ b/kernel/loongarch64/dgemm_ncopy_8_lsx.S @@ -0,0 +1,283 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r21 +#define TL $r7 +#define T0 $r6 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define D0 $vr8 +#define D1 $vr9 +#define D2 $vr10 +#define D3 $vr11 +#define D4 $vr12 +#define D5 $vr13 +#define D6 $vr14 +#define D7 $vr15 + + PROLOGUE + push_if_used 26, 32 + move TD, DST + move TS, SRC + slli.d TL, LDA, 0x03 + slli.d T0, TL, 0x01 + srai.d J, N, 0x03 + beq J, ZERO, .L_N4 +.L_J1: + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x03 + add.d S3, S2, TL + addi.d J, J, -1 + add.d S4, S3, TL + add.d S5, S3, T0 + add.d S6, S4, T0 + add.d S7, S5, T0 + add.d S8, S6, T0 + add.d TS, S7, T0 + beq I, ZERO, .L_I7 +.L_I1: + GLD v, , U0, S1, 0x00, U1, S2, 0x00, U2, S3, 0x00, U3, S4, 0x00, \ + U4, S5, 0x00, U5, S6, 0x00, U6, S7, 0x00, U7, S8, 0x00 + GINTERLACE v, d, D0, D4, U1, U0 + GINTERLACE v, d, D1, D5, U3, U2 + GINTERLACE v, d, D2, D6, U5, U4 + GINTERLACE v, d, D3, D7, U7, U6 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \ + D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70 + addi.d TD, TD, 0x80 + GLD v, , U0, S1, 0x10, U1, S2, 0x10, U2, S3, 0x10, U3, S4, 0x10, \ + U4, S5, 0x10, U5, S6, 0x10, U6, S7, 0x10, U7, S8, 0x10 + GINTERLACE v, d, D0, D4, U1, U0 + GINTERLACE v, d, D1, D5, U3, U2 + GINTERLACE v, d, D2, D6, U5, U4 + GINTERLACE v, d, D3, D7, U7, U6 + GST v, , D0, TD, 0x00, 
D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \ + D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70 + addi.d TD, TD, 0x80 + GLD v, , U0, S1, 0x20, U1, S2, 0x20, U2, S3, 0x20, U3, S4, 0x20, \ + U4, S5, 0x20, U5, S6, 0x20, U6, S7, 0x20, U7, S8, 0x20 + GINTERLACE v, d, D0, D4, U1, U0 + GINTERLACE v, d, D1, D5, U3, U2 + GINTERLACE v, d, D2, D6, U5, U4 + GINTERLACE v, d, D3, D7, U7, U6 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \ + D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70 + addi.d TD, TD, 0x80 + GLD v, , U0, S1, 0x30, U1, S2, 0x30, U2, S3, 0x30, U3, S4, 0x30, \ + U4, S5, 0x30, U5, S6, 0x30, U6, S7, 0x30, U7, S8, 0x30 + GINTERLACE v, d, D0, D4, U1, U0 + GINTERLACE v, d, D1, D5, U3, U2 + GINTERLACE v, d, D2, D6, U5, U4 + GINTERLACE v, d, D3, D7, U7, U6 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \ + D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70 + addi.d TD, TD, 0x80 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d S5, S5, 0x40 + addi.d S6, S6, 0x40 + addi.d S7, S7, 0x40 + addi.d S8, S8, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 +.L_I7: + andi I, M, 0x07 + beq I, ZERO, .L_I0 +.L_II1: /* I-- */ + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + fld.d F4, S5, 0x00 + fld.d F5, S6, 0x00 + fld.d F6, S7, 0x00 + fld.d F7, S8, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + fst.d F2, TD, 0x10 + addi.d S3, S3, 0x08 + fst.d F3, TD, 0x18 + addi.d S4, S4, 0x08 + fst.d F4, TD, 0x20 + addi.d S5, S5, 0x08 + fst.d F5, TD, 0x28 + addi.d S6, S6, 0x08 + fst.d F6, TD, 0x30 + addi.d S7, S7, 0x08 + fst.d F7, TD, 0x38 + addi.d S8, S8, 0x08 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_II1 +.L_I0: + blt ZERO, J, .L_J1 +.L_N4: + andi J, N, 0x04 + beq ZERO, J, .L_N2 + + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x02 + add.d S3, S2, TL + add.d S4, S2, T0 + add.d TS, S3, T0 + beq I, ZERO, .L_I3 +.L_4I1: /* I-- */ + GLD v, , U0, S1, 0x00, U1, S2, 0x00, U2, S3, 0x00, U3, S4, 0x00 + GINTERLACE v, d, D0, D2, U1, U0 + GINTERLACE v, d, D1, D3, U3, U2 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30 + addi.d TD, TD, 0x40 + + GLD v, , U0, S1, 0x10, U1, S2, 0x10, U2, S3, 0x10, U3, S4, 0x10 + GINTERLACE v, d, D0, D2, U1, U0 + GINTERLACE v, d, D1, D3, U3, U2 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_4I1 +.L_I3: + andi I, M, 0x03 + beq I, ZERO, .L_N2 +.L_4II1: + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + fst.d F2, TD, 0x10 + addi.d S3, S3, 0x08 + fst.d F3, TD, 0x18 + addi.d S4, S4, 0x08 + + addi.d TD, TD, 0x20 + addi.d I, I, -1 + blt ZERO, I, .L_4II1 +.L_N2: + andi J, N, 0x02 + beq ZERO, J, .L_N1 + + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x01 + add.d TS, S2, TL + beq I, ZERO, .L_NI1 +.L_2I1: /* I-- */ + GLD v, , U0, S1, 0x00, U1, S2, 0x00 + GINTERLACE v, d, D0, D1, U1, U0 + GST v, , D0, TD, 0x00, D1, TD, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d TD, TD, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_2I1 +.L_NI1: + andi I, M, 0x01 + beq I, ZERO, .L_N1 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 
+ addi.d TD, TD, 0x10 +.L_N1: + move S1, TS + beq ZERO, M, .L_N0 +.L_M1: + fld.d F0, S1, 0x00 + addi.d S1, S1, 0x08 + fst.d F0, TD, 0x00 + addi.d TD, TD, 0x08 + addi.d M, M, -1 + blt ZERO, M, .L_M1 +.L_N0: + pop_if_used 26, 32 + jirl $r0, $r1, 0x00 + EPILOGUE diff --git a/kernel/loongarch64/dgemm_tcopy_4_lsx.S b/kernel/loongarch64/dgemm_tcopy_4_lsx.S new file mode 100644 index 000000000..134066471 --- /dev/null +++ b/kernel/loongarch64/dgemm_tcopy_4_lsx.S @@ -0,0 +1,280 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S0 $r11 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define P0 $r16 +#define P1 $r17 +#define P2 $r18 +#define P3 $r19 +#define T0 $r20 +#define T1 $r23 +#define TL $r7 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 + + PROLOGUE + push_if_used 18, 8 + + move S0, SRC + move P0, DST + + // Find P0, P2, P3 + srai.d T0, N, 0x02 + slli.d T0, T0, 0x02 + srai.d T1, N, 0x01 + slli.d T1, T1, 0x01 + mul.d T0, M, T0 + mul.d T1, M, T1 + slli.d T0, T0, 0x03 + slli.d T1, T1, 0x03 + add.d P2, DST, T0 + add.d P3, DST, T1 + + slli.d TL, LDA, 0x03 + srai.d J, M, 0x02 + slli.d T0, TL, 0x01 + slli.d T1, M, 0x05 + beq ZERO, J, .L_M3 +.L_J1: /* J-- */ + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T0 + add.d S4, S2, T0 + add.d S0, S3, T0 + + move P1, P0 + addi.d P0, P0, 0x80 + + srai.d I, N, 0x02 + addi.d J, J, -1 + beq ZERO, I, .L_N3 +.L_I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + vld U4, S3, 0x00 + vld U5, S3, 0x10 + vld U6, S4, 0x00 + vld U7, S4, 0x10 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + vst U2, P1, 0x20 + vst U3, P1, 0x30 + vst U4, P1, 0x40 + vst U5, P1, 0x50 + vst U6, P1, 0x60 + vst U7, P1, 0x70 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + add.d P1, P1, T1 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 +.L_N3: + andi I, N, 0x02 + beq ZERO, I, .L_N1 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vld U2, S3, 0x00 + vld U3, S4, 0x00 + + vst U0, P2, 0x00 + vst U1, P2, 0x10 + vst U2, P2, 0x20 + vst U3, P2, 0x30 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d P2, P2, 0x40 +.L_N1: + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, P3, 0x00 + fst.d F1, P3, 0x08 + fst.d F2, P3, 0x10 + fst.d F3, P3, 0x18 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d P3, P3, 0x20 + +.L_N0: + blt ZERO, J, .L_J1 + +.L_M3: + andi J, M, 0x02 + beq ZERO, J, .L_M1 + + move S1, S0 + add.d S2, S0, TL + add.d S0, S0, T0 + + move P1, P0 + addi.d P0, P0, 0x40 + + srai.d I, N, 0x02 + beq ZERO, I, .L_2N3 + +.L_2I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + vst U2, P1, 0x20 + vst U3, P1, 0x30 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d I, I, -1 + add.d P1, P1, T1 + + blt ZERO, I, .L_2I1 + +.L_2N3: + andi I, N, 0x02 + beq ZERO, I, .L_2N1 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, P2, 0x00 + vst U1, P2, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d P2, P2, 0x20 + +.L_2N1: + addi.d I, N, 0x01 + beq ZERO, I, .L_M1 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + + fst.d F0, P3, 0x00 + fst.d F1, P3, 0x08 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d P3, P3, 0x10 +.L_M1: + andi J, M, 0x01 + beq ZERO, J, .L_M0 + + move S1, S0 + move P1, P0 + + srai.d I, N, 0x02 + beq 
ZERO, I, .L_1N3 + +.L_1I1: + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + + addi.d S1, S1, 0x20 + addi.d I, I, -1 + add.d P1, P1, T1 + + blt ZERO, I, .L_1I1 + +.L_1N3: + andi I, N, 0x02 + beq I, ZERO, .L_1N1 + + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + + fst.d F0, P2, 0x00 + fst.d F1, P2, 0x08 + + addi.d S1, S1, 0x10 + addi.d P2, P2, 0x10 + +.L_1N1: + andi I, N, 0x01 + beq I, ZERO, .L_M0 + + fld.d F0, S1, 0x00 + + fst.d F0, P3, 0x00 + +.L_M0: + pop_if_used 18, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE diff --git a/kernel/loongarch64/dgemm_tcopy_8_lsx.S b/kernel/loongarch64/dgemm_tcopy_8_lsx.S new file mode 100644 index 000000000..a7e3ef69c --- /dev/null +++ b/kernel/loongarch64/dgemm_tcopy_8_lsx.S @@ -0,0 +1,597 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S0 $r11 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define P0 $r20 +#define P1 $r23 +#define P2 $r24 +#define P3 $r25 +#define P4 $r26 +#define P5 $r27 +#define T0 $r28 +#define T1 $r29 +#define TL $r7 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LASX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 + + PROLOGUE + push_if_used 24, 8 + + move S0, SRC + move P0, DST + + srai.d T0, N, 0x03 + srai.d T1, N, 0x02 + slli.d T0, T0, 0x03 + slli.d T1, T1, 0x02 + mul.d P2, M, T0 + mul.d P3, M, T1 + slli.d P2, P2, 0x03 + slli.d P3, P3, 0x03 + add.d P2, DST, P2 + add.d P3, DST, P3 + + srai.d T0, N, 0x01 + slli.d T0, T0, 0x01 + mul.d P4, M, T0 + slli.d P4, P4, 0x03 + add.d P4, DST, P4 + + slli.d TL, LDA, 0x03 + srai.d J, M, 0x03 + slli.d T0, TL, 0x01 + slli.d T1, M, 0x06 + beq ZERO, J, .L_M7 +.L_J1: /* J-- */ + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T0 + add.d S4, S2, T0 + add.d S5, S3, T0 + add.d S6, S4, T0 + add.d S7, S5, T0 + add.d S8, S6, T0 + add.d S0, S7, T0 + + move P1, P0 + addi.d P0, P0, 0x200 + + srai.d I, N, 0x03 + addi.d J, J, -1 + beq ZERO, I, .L_N7 + +.L_I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + vst U2, P1, 0x20 + vst U3, P1, 0x30 + vst U4, P1, 0x40 + vst U5, P1, 0x50 + vst U6, P1, 0x60 + vst U7, P1, 0x70 + + vld U0, S3, 0x00 + vld U1, S3, 0x10 + vld U2, S3, 0x20 + vld U3, S3, 0x30 + vld U4, S4, 0x00 + vld U5, S4, 0x10 + vld U6, S4, 0x20 + vld U7, S4, 0x30 + + vst U0, P1, 0x80 + vst U1, P1, 0x90 + vst U2, P1, 0xa0 + vst U3, P1, 0xb0 + vst U4, P1, 0xc0 + vst U5, P1, 0xd0 + vst U6, P1, 0xe0 + vst U7, P1, 0xf0 + + vld U0, S5, 0x00 + vld U1, S5, 0x10 + vld U2, S5, 0x20 + vld U3, S5, 0x30 + vld U4, S6, 0x00 + vld U5, S6, 0x10 + vld U6, S6, 0x20 + vld U7, S6, 0x30 + + vst U0, P1, 0x100 + vst U1, P1, 0x110 + vst U2, P1, 0x120 + vst U3, P1, 0x130 + vst U4, P1, 0x140 + vst U5, P1, 0x150 + vst U6, P1, 0x160 + vst U7, P1, 0x170 + + vld U0, S7, 0x00 + vld U1, S7, 0x10 + vld U2, S7, 0x20 + vld U3, S7, 0x30 + vld U4, S8, 0x00 + vld U5, S8, 0x10 + vld U6, S8, 0x20 + vld U7, S8, 0x30 + + vst U0, P1, 0x180 + vst U1, P1, 0x190 + vst U2, P1, 0x1a0 + vst U3, P1, 0x1b0 + vst U4, P1, 0x1c0 + vst U5, P1, 0x1d0 + vst U6, P1, 0x1e0 + vst U7, P1, 0x1f0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d S5, S5, 0x40 + addi.d S6, S6, 0x40 + addi.d S7, S7, 0x40 + addi.d S8, S8, 0x40 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_I1 +.L_N7: + andi I, N, 0x04 + beq ZERO, I, .L_N3 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + vld U4, S3, 0x00 + vld U5, S3, 0x10 + vld U6, S4, 0x00 + vld U7, S4, 0x10 + + vst U0, P2, 0x00 + vst U1, P2, 0x10 + vst U2, P2, 0x20 + vst U3, P2, 0x30 + vst U4, P2, 0x40 + 
vst U5, P2, 0x50 + vst U6, P2, 0x60 + vst U7, P2, 0x70 + + vld U0, S5, 0x00 + vld U1, S5, 0x10 + vld U2, S6, 0x00 + vld U3, S6, 0x10 + vld U4, S7, 0x00 + vld U5, S7, 0x10 + vld U6, S8, 0x00 + vld U7, S8, 0x10 + + vst U0, P2, 0x80 + vst U1, P2, 0x90 + vst U2, P2, 0xa0 + vst U3, P2, 0xb0 + vst U4, P2, 0xc0 + vst U5, P2, 0xd0 + vst U6, P2, 0xe0 + vst U7, P2, 0xf0 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d S5, S5, 0x20 + addi.d S6, S6, 0x20 + addi.d S7, S7, 0x20 + addi.d S8, S8, 0x20 + addi.d P2, P2, 0x100 + +.L_N3: + andi I, N, 0x02 + beq ZERO, I, .L_N1 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vld U2, S3, 0x00 + vld U3, S4, 0x00 + vld U4, S5, 0x00 + vld U5, S6, 0x00 + vld U6, S7, 0x00 + vld U7, S8, 0x00 + + vst U0, P3, 0x00 + vst U1, P3, 0x10 + vst U2, P3, 0x20 + vst U3, P3, 0x30 + vst U4, P3, 0x40 + vst U5, P3, 0x50 + vst U6, P3, 0x60 + vst U7, P3, 0x70 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d S5, S5, 0x10 + addi.d S6, S6, 0x10 + addi.d S7, S7, 0x10 + addi.d S8, S8, 0x10 + addi.d P3, P3, 0x80 + +.L_N1: + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + fld.d F4, S5, 0x00 + fld.d F5, S6, 0x00 + fld.d F6, S7, 0x00 + fld.d F7, S8, 0x00 + + fst.d F0, P4, 0x00 + fst.d F1, P4, 0x08 + fst.d F2, P4, 0x10 + fst.d F3, P4, 0x18 + fst.d F4, P4, 0x20 + fst.d F5, P4, 0x28 + + fst.d F6, P4, 0x30 + fst.d F7, P4, 0x38 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d S5, S5, 0x08 + addi.d S6, S6, 0x08 + addi.d S7, S7, 0x08 + addi.d S8, S8, 0x08 + addi.d P4, P4, 0x40 + +.L_N0: + blt ZERO, J, .L_J1 +.L_M7: + andi J, M, 0x04 + beq ZERO, J, .L_M3 + + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T0 + add.d S4, S2, T0 + add.d S0, S3, T0 + + move P1, P0 + addi.d P0, P0, 0x100 + + srai.d I, N, 0x03 + beq ZERO, I, .L_4N7 +.L_4I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + vst U2, P1, 0x20 + vst U3, P1, 0x30 + vst U4, P1, 0x40 + vst U5, P1, 0x50 + vst U6, P1, 0x60 + vst U7, P1, 0x70 + + vld U0, S3, 0x00 + vld U1, S3, 0x10 + vld U2, S3, 0x20 + vld U3, S3, 0x30 + vld U4, S4, 0x00 + vld U5, S4, 0x10 + vld U6, S4, 0x20 + vld U7, S4, 0x30 + + vst U0, P1, 0x80 + vst U1, P1, 0x90 + vst U2, P1, 0xa0 + vst U3, P1, 0xb0 + vst U4, P1, 0xc0 + vst U5, P1, 0xd0 + vst U6, P1, 0xe0 + vst U7, P1, 0xf0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_4I1 +.L_4N7: + andi I, N, 0x04 + beq ZERO, I, .L_4N3 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + vld U4, S3, 0x00 + vld U5, S3, 0x10 + vld U6, S4, 0x00 + vld U7, S4, 0x10 + + vst U0, P2, 0x00 + vst U1, P2, 0x10 + vst U2, P2, 0x20 + vst U3, P2, 0x30 + vst U4, P2, 0x40 + vst U5, P2, 0x50 + vst U6, P2, 0x60 + vst U7, P2, 0x70 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d P2, P2, 0x80 + +.L_4N3: + andi I, N, 0x02 + beq ZERO, I, .L_4N1 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vld U2, S3, 0x00 + vld U3, S4, 0x00 + + vst U0, P3, 0x00 + vst U1, P3, 0x10 + vst U2, P3, 0x20 + vst U3, P3, 0x30 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d P3, P3, 0x40 + +.L_4N1: + andi 
I, N, 0x01 + beq ZERO, I, .L_M3 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, P4, 0x00 + fst.d F1, P4, 0x08 + fst.d F2, P4, 0x10 + fst.d F3, P4, 0x18 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d P4, P4, 0x20 +.L_M3: + andi J, M, 0x02 + beq ZERO, J, .L_M1 + + move S1, S0 + add.d S2, S0, TL + add.d S0, S0, T0 + + move P1, P0 + addi.d P0, P0, 0x80 + + srai.d I, N, 0x03 + beq ZERO, I, .L_2N7 +.L_2I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + vst U2, P1, 0x20 + vst U3, P1, 0x30 + vst U4, P1, 0x40 + vst U5, P1, 0x50 + vst U6, P1, 0x60 + vst U7, P1, 0x70 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_2I1 +.L_2N7: + andi I, N, 0x04 + beq ZERO, I, .L_2N3 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, P2, 0x00 + vst U1, P2, 0x10 + vst U2, P2, 0x20 + vst U3, P2, 0x30 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d P2, P2, 0x40 + +.L_2N3: + andi I, N, 0x02 + beq ZERO, I, .L_2N1 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, P3, 0x00 + vst U1, P3, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d P3, P3, 0x20 + +.L_2N1: + andi I, N, 0x01 + beq ZERO, I, .L_M1 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + + fst.d F0, P4, 0x00 + fst.d F1, P4, 0x08 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d P4, P4, 0x10 +.L_M1: + andi J, M, 0x01 + beq ZERO, J, .L_M0 + + move S1, S0 + add.d S2, S0, TL + + move P1, P0 + addi.d P0, P0, 0x40 + + srai.d I, N, 0x03 + beq ZERO, I, .L_1N7 +.L_1I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + vst U2, P1, 0x20 + vst U3, P1, 0x30 + + addi.d S1, S1, 0x40 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_1I1 + +.L_1N7: + andi I, N, 0x04 + beq ZERO, I, .L_1N3 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, P2, 0x00 + vst U1, P2, 0x10 + + addi.d S1, S1, 0x20 + addi.d P2, P2, 0x20 + +.L_1N3: + andi I, N, 0x02 + beq ZERO, I, .L_1N1 + + vld U0, S1, 0x00 + vst U0, P3, 0x00 + + addi.d S1, S1, 0x10 + addi.d P3, P3, 0x10 + +.L_1N1: + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + fld.d F0, S1, 0x00 + + fst.d F0, P4, 0x00 + + addi.d S1, S1, 0x08 + addi.d P4, P4, 0x08 +.L_M0: + pop_if_used 24, 8 + jirl $r0, $r1, 0x00 + EPILOGUE diff --git a/kernel/loongarch64/dmax_lasx.S b/kernel/loongarch64/dmax_lasx.S deleted file mode 100644 index 46366d2ec..000000000 --- a/kernel/loongarch64/dmax_lasx.S +++ /dev/null @@ -1,175 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define J $r13 -#define t1 $r14 -#define t2 $r18 -#define t3 $r15 -#define t4 $r17 -#define TEMP $r16 -#define m0 $xr8 -#define x1 $xr9 -#define x2 $xr10 -#define x3 $xr11 -#define x4 $xr12 -#define VX0 $xr20 -#define VX1 $xr21 -#define VM0 $xr22 -#define VM1 $xr23 -#define VM2 $xr19 - - PROLOGUE - - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - xvld VM0, X, 0 - srai.d I, N, 3 - bge $r0, I, .L12 - .align 3 - -.L10: - xvld VX0, X, 0 * SIZE - xvld VX1, X, 4 * SIZE - addi.d I, I, -1 - xvfmax.d VM1, VX1, VX0 - addi.d X, X, 8 * SIZE - xvfmax.d VM0, VM0, VM1 - blt $r0, I, .L10 - .align 3 - -.L11: - 
xvpickve.d x1, VM0, 0 - xvpickve.d x2, VM0, 1 - xvpickve.d x3, VM0, 2 - xvpickve.d x4, VM0, 3 - xvfmax.d VM1, x1, x2 - xvfmax.d VM2, x3, x4 - xvfmax.d VM0, VM1, VM2 - .align 3 - -.L12: //INCX==1 and N<8 - andi I, N, 7 - li.d J, 4 - bge J, I, .L13 // 4 0) I-- */ + move S1, TS //a_offset1 + add.d S2, TS, TL //a_offset2 + srai.d J, M, 0x02 + add.d TS, TS, T0 + + beq J, ZERO, .L_I3 + +.L_I1: /* if (j > 0) J-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvand.v D0, U0, U0 + xvand.v D1, U1, U1 + xvand.v D2, U2, U2 + xvand.v D3, U3, U3 + + xvpermi.q D0, U2, 0x02 + xvpermi.q D2, U0, 0x31 + xvpermi.q D1, U3, 0x02 + xvpermi.q D3, U1, 0x31 + + xvst D0, TD, 0x00 + xvst D2, TD, 0x20 + xvst D1, TD, 0x40 + xvst D3, TD, 0x60 + + addi.d S1, S1, 0x40 // a_offset1 + addi.d S2, S2, 0x40 + addi.d TD, TD, 0x80 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_I1 + +.L_I3: + andi J, M, 0x03 + beq J, ZERO, .L_II20 + +.L_II1: /* j = (m & 3) if (j > 0) */ + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + + vst $vr0, TD, 0x00 + vst $vr1, TD, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d TD, TD, 0x20 + + addi.d J, J, -1 + blt ZERO, J, .L_II1 + +.L_II20: + addi.d I, I, -1 + blt ZERO, I, .L_J1 + +.L_N0: /* if(n&1)*/ + andi I, N, 0x01 + beq ZERO, I, .L_N00 + +.L_N1: + srai.d J, M, 0x02 + beq ZERO, J, .L_N10 + +.L_N11: /* j = (m >> 2) if (j > 0) */ + xvld U0, TS, 0x00 + xvld U1, TS, 0x20 + + xvst U0, TD, 0x00 + xvst U1, TD, 0x20 + + addi.d TS, TS, 0x40 // a_offset + addi.d TD, TD, 0x40 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N11 + +.L_N10: + andi J, M, 0x03 + beq J, ZERO, .L_N00 + +.L_N12: /* j = (m & 3) if (j > 0) */ + vld $vr0, TS, 0x00 + vst $vr0, TD, 0x00 + + + addi.d TS, TS, 0x10 // a_offset + addi.d TD, TD, 0x10 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N12 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_tcopy_2_lasx.S b/kernel/loongarch64/zgemm_tcopy_2_lasx.S new file mode 100644 index 000000000..3fe17beef --- /dev/null +++ b/kernel/loongarch64/zgemm_tcopy_2_lasx.S @@ -0,0 +1,212 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r7 +#define T0 $r18 +#define S8 $r19 +#define S9 $r20 +#define S10 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr8 +#define D1 $xr9 +#define D2 $xr10 +#define D3 $xr11 +#define D4 $xr12 +#define D5 $xr13 +#define D6 $xr14 +#define D7 $xr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x03 //lda + slli.d TL, TL, 0x01 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x03 + add.d S9, DST, T0 //boffset2 + + srai.d J, M, 0x01 //j + + beq J, ZERO, .L_M1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 + add.d S2, S1, TL //aoffset2 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x40 + + srai.d I, N, 0x02 + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvst U0, S8, 0x00 + xvst U2, S8, 0x20 + + slli.d T0, M, 0x05 + add.d S8, S8, T0 + + xvst U1, S8, 0x00 + xvst U3, S8, 0x20 + + add.d S8, S8, T0 + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvst U0, S8, 0x00 + xvst U1, S8, 0x20 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_J0 + + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + + vst $vr0, S9, 0x00 + vst $vr1, S9, 0x10 + + addi.d S9, S9, 0x20 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_M0 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M1N1 + +.L_M1I1: /* if(i>0) */ + xvld U0, TS, 0x00 + xvld U1, TS, 0x20 + + xvst U0, TD, 0x00 + + slli.d T0, M, 0x05 + add.d TD, TD, T0 + + xvst U1, TD, 0x00 + + add.d TD, TD, T0 + addi.d TS, TS, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_M1I1 + +.L_M1N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M1N2 + + xvld U0, TS, 0x00 + + xvst U0, TD, 0x00 + + addi.d TS, TS, 0x20 + +.L_M1N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + vld $vr0, TS, 0x00 + + vst $vr0, S9, 0x00 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git 
a/kernel/loongarch64/znrm2_lasx.S b/kernel/loongarch64/znrm2_lasx.S new file mode 100644 index 000000000..53f8a6e05 --- /dev/null +++ b/kernel/loongarch64/znrm2_lasx.S @@ -0,0 +1,252 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define XX $r19 +#define I $r17 +#define TEMP $r18 +#define t1 $r12 +#define t2 $r13 +#define t3 $r14 +#define t4 $r15 +#define INF $f23 +#define a1 $f22 +#define max $f17 +#define ALPHA $f12 +#define a3 $f15 +#define a2 $f16 +#define VX0 $xr15 +#define VX1 $xr16 +#define VM0 $xr17 +#define VM1 $xr18 +#define VM2 $xr13 +#define VM3 $xr14 +#define res1 $xr19 +#define res2 $xr20 +#define VALPHA $xr21 + + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + xvxor.v VM0, VM0, VM0 + bge $r0, N, .L999 + beq $r0, INCX, .L999 + move XX, X + // Init INF + addi.d TEMP, $r0, 0x7FF + slli.d TEMP, TEMP, 52 + MTC INF, TEMP + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bge $r0, I, .L97 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfmaxa.d VM1, VX1, VX0 + xvfmaxa.d VM0, VM0, VM1 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + b .L96 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L97 + .align 3 + +.L21: + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvfmaxa.d VM1, VX0, VX1 + xvfmaxa.d 
VM0, VM0, VM1 + addi.d I, I, -1 + blt $r0, I, .L21 + b .L96 + .align 3 + +.L96: + xvpickve.d VX0, VM0, 1 + xvpickve.d VX1, VM0, 2 + xvpickve.d VM3, VM0, 3 + xvfmaxa.d VM1, VX0, VX1 + xvfmaxa.d VM2, VM3, VM0 + xvfmaxa.d VM0, VM1, VM2 + .align 3 + +.L97: + andi I, N, 3 + bge $r0, I, .L99 + .align 3 + +.L98: + fld.d a3, X, 0 * SIZE + fld.d a2, X, 1 * SIZE + fmaxa.d a3, a2, a3 + fmaxa.d max, a3, max + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L98 + .align 3 + +.L99: + fabs.d max, max + lu12i.w TEMP, 0x3f800 // 1 + movgr2fr.d a1, $r0 + movgr2fr.w ALPHA, TEMP + CMPEQ $fcc0, max, a1 + fcvt.d.s ALPHA, ALPHA + bcnez $fcc0, .L999 + fdiv.d ALPHA, ALPHA, max + CMPEQ $fcc0, INF, ALPHA + bcnez $fcc0, .L999 + movfr2gr.d TEMP, ALPHA + xvreplgr2vr.d VALPHA, TEMP + +.L100: + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L120 + bge $r0, I, .L997 + .align 3 + +.L110: + xvld VX0, XX, 0 * SIZE + xvld VX1, XX, 4 * SIZE + xvfmul.d VM2, VX0, VALPHA + xvfmul.d VM3, VX1, VALPHA + xvfmadd.d res1, VM2, VM2, res1 + xvfmadd.d res2, VM3, VM3, res2 + addi.d XX, XX, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L110 + b .L996 + .align 3 + +.L120: + bge $r0, I, .L997 + .align 3 + +.L121: + ld.d t1, XX, 0 * SIZE + ld.d t2, XX, 1 * SIZE + add.d XX, XX, INCX + ld.d t3, XX, 0 * SIZE + ld.d t4, XX, 1 * SIZE + add.d XX, XX, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, XX, 0 * SIZE + ld.d t2, XX, 1 * SIZE + add.d XX, XX, INCX + ld.d t3, XX, 0 * SIZE + ld.d t4, XX, 1 * SIZE + add.d XX, XX, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmul.d VM2, VX0, VALPHA + xvfmul.d VM3, VX1, VALPHA + xvfmadd.d res1, VM2, VM2, res1 + xvfmadd.d res2, VM3, VM3, res2 + addi.d I, I, -1 + blt $r0, I, .L121 + b .L996 + .align 3 + +.L996: + xvfadd.d res1, res1, res2 + xvpickve.d VX0, res1, 1 + xvpickve.d VX1, res1, 2 + xvpickve.d VM2, res1, 3 + xvfadd.d res1, VX0, res1 + xvfadd.d VX1, VX1, VM2 + xvfadd.d res1, VX1, res1 + .align 3 + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.d a3, XX, 0 * SIZE + fld.d a2, XX, 1 * SIZE + addi.d I, I, -1 + fmul.d a3, a3, ALPHA + fmadd.d $f19, a3, a3, $f19 + fmul.d a2, a2, ALPHA + fmadd.d $f19, a2, a2, $f19 + add.d XX, XX , INCX + blt $r0, I, .L998 + .align 3 + +.L999: + fsqrt.d $f19, $f19 + fmul.d $f0, max, $f19 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/znrm2_lsx.S b/kernel/loongarch64/znrm2_lsx.S new file mode 100644 index 000000000..14c59d504 --- /dev/null +++ b/kernel/loongarch64/znrm2_lsx.S @@ -0,0 +1,260 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define XX $r19 +#define I $r17 +#define TEMP $r18 +#define t1 $r12 +#define t2 $r13 +#define t3 $r14 +#define t4 $r15 +#define INF $f23 +#define a1 $f22 +#define max $f17 +#define ALPHA $f12 +#define a3 $f15 +#define a2 $f16 +#define VX0 $vr15 +#define VX1 $vr16 +#define VM0 $vr17 +#define VM1 $vr18 +#define VM2 $vr13 +#define VM3 $vr14 +#define res1 $vr19 +#define res2 $vr20 +#define VALPHA $vr21 + + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + vxor.v VM0, VM0, VM0 + bge $r0, N, .L999 + beq $r0, INCX, .L999 + move XX, X + // Init INF + addi.d TEMP, $r0, 0x7FF + slli.d TEMP, TEMP, 52 + MTC INF, TEMP + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bge $r0, I, .L97 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfmaxa.d VM1, VX1, VX0 + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vfmaxa.d VM2, VX1, VX0 + vfmaxa.d VM3, VM1, VM2 + vfmaxa.d VM0, VM0, VM3 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + b .L96 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L97 + .align 3 + +.L21: + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmaxa.d VM1, VX0, VX1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM2, VX0, VX1 + vfmaxa.d VM3, VM1, VM2 + vfmaxa.d VM0, VM0, VM3 + addi.d I, I, -1 + blt $r0, I, .L21 + b .L96 + .align 3 + +.L96: + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + vfmaxa.d VM0, VX0, VX1 + .align 3 + +.L97: + andi I, N, 3 + bge $r0, I, .L99 + .align 3 + +.L98: + fld.d a3, X, 0 * SIZE + fld.d a2, X, 1 * SIZE + fmaxa.d a3, a2, a3 + fmaxa.d max, a3, max + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L98 + .align 3 + +.L99: + fabs.d max, max + lu12i.w TEMP, 0x3f800 // 1 + movgr2fr.d a1, $r0 + movgr2fr.w ALPHA, TEMP + CMPEQ $fcc0, max, a1 + fcvt.d.s ALPHA, ALPHA + bcnez $fcc0, .L999 + fdiv.d ALPHA, ALPHA, max + CMPEQ $fcc0, INF, ALPHA + bcnez $fcc0, .L999 + movfr2gr.d TEMP, ALPHA + vreplgr2vr.d VALPHA, TEMP + +.L100: + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L120 + bge $r0, I, .L997 + .align 3 + +.L110: + vld VX0, XX, 0 * SIZE + vld VX1, XX, 2 * SIZE + 
vfmul.d VM2, VX0, VALPHA + vfmul.d VM3, VX1, VALPHA + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 + vld VX0, XX, 4 * SIZE + vld VX1, XX, 6 * SIZE + vfmul.d VM2, VX0, VALPHA + vfmul.d VM3, VX1, VALPHA + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 + addi.d XX, XX, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L110 + b .L996 + .align 3 + +.L120: + bge $r0, I, .L997 + .align 3 + +.L121: + ld.d t1, XX, 0 * SIZE + ld.d t2, XX, 1 * SIZE + add.d XX, XX, INCX + ld.d t3, XX, 0 * SIZE + ld.d t4, XX, 1 * SIZE + add.d XX, XX, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmul.d VM2, VX0, VALPHA + ld.d t1, XX, 0 * SIZE + vfmul.d VM3, VX1, VALPHA + ld.d t2, XX, 1 * SIZE + add.d XX, XX, INCX + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 + ld.d t3, XX, 0 * SIZE + ld.d t4, XX, 1 * SIZE + add.d XX, XX, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmul.d VM2, VX0, VALPHA + vfmul.d VM3, VX1, VALPHA + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 + addi.d I, I, -1 + blt $r0, I, .L121 + b .L996 + .align 3 + +.L996: + vfadd.d res1, res1, res2 + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 + .align 3 + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.d a3, XX, 0 * SIZE + fld.d a2, XX, 1 * SIZE + addi.d I, I, -1 + fmul.d a3, a3, ALPHA + fmadd.d $f19, a3, a3, $f19 + fmul.d a2, a2, ALPHA + fmadd.d $f19, a2, a2, $f19 + add.d XX, XX , INCX + blt $r0, I, .L998 + .align 3 + +.L999: + fsqrt.d $f19, $f19 + fmul.d $f0, max, $f19 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/param.h b/param.h index 1ec2d16dd..03cea76c9 100644 --- a/param.h +++ b/param.h @@ -2853,13 +2853,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #define QGEMM_DEFAULT_UNROLL_N 2 -#define CGEMM_DEFAULT_UNROLL_N 4 -#define ZGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 #define QGEMM_DEFAULT_UNROLL_M 2 -#define CGEMM_DEFAULT_UNROLL_M 1 -#define ZGEMM_DEFAULT_UNROLL_M 1 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 2 #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_P 256 @@ -2888,8 +2888,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 8 -#define DGEMM_DEFAULT_UNROLL_M 2 -#define DGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 #define CGEMM_DEFAULT_UNROLL_M 1 #define CGEMM_DEFAULT_UNROLL_N 4
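
The dgemm_ncopy_* and dgemm_tcopy_* kernels added above implement the panel packing that the blocked DGEMM kernel consumes, which is why param.h now sets DGEMM_DEFAULT_UNROLL_M to 8 and DGEMM_DEFAULT_UNROLL_N to 4. As a rough guide to the layout produced by the main loop of dgemm_tcopy_8_lsx.S, the following minimal C sketch covers the full-tile path only; the function name is illustrative, lda is counted in elements, and the m % 8 and n % 8 edge cases that the assembly writes to separate 4-, 2- and 1-wide regions of dst are omitted.

    /* Illustrative sketch, not the project's generic copy code. */
    static void dgemm_tcopy_8_sketch(long m, long n, const double *src, long lda,
                                     double *dst)
    {
        for (long jb = 0; jb < m / 8; jb++) {        /* 8-row blocks        */
            for (long ib = 0; ib < n / 8; ib++) {    /* 8-column panels     */
                /* panel ib starts ib*m*8 doubles into dst; row block jb adds
                   another 64 doubles, matching the M*64-byte step on P1 and
                   the 0x200-byte step on P0 in the assembly */
                double *p = dst + ib * m * 8 + jb * 64;
                for (long r = 0; r < 8; r++)
                    for (long c = 0; c < 8; c++)
                        p[r * 8 + c] = src[(jb * 8 + r) * lda + ib * 8 + c];
            }
        }
    }

Under these assumptions, each 8-column panel occupies m * 8 consecutive doubles, and each 8x8 tile inside it is stored row by row, which is the order in which the vst sequence in .L_I1 writes U0 through U7.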