Merge branch 'risc-v' into develop
This commit is contained in:
@@ -42,8 +42,8 @@ ZSUMKERNEL = ../arm/zsum.c
|
||||
|
||||
SAXPYKERNEL = axpy_vector.c
|
||||
DAXPYKERNEL = axpy_vector.c
|
||||
CAXPYKERNEL = zaxpy.c
|
||||
ZAXPYKERNEL = zaxpy.c
|
||||
CAXPYKERNEL = zaxpy_vector.c
|
||||
ZAXPYKERNEL = zaxpy_vector.c
|
||||
|
||||
SAXPBYKERNEL = axpby_vector.c
|
||||
DAXPBYKERNEL = axpby_vector.c
|
||||
@@ -59,7 +59,7 @@ SDOTKERNEL = dot_vector.c
|
||||
DDOTKERNEL = dot_vector.c
|
||||
CDOTKERNEL = zdot_vector.c
|
||||
ZDOTKERNEL = zdot_vector.c
|
||||
DSDOTKERNEL = ../generic/dot.c
|
||||
DSDOTKERNEL = dsdot_vector.c
|
||||
|
||||
SNRM2KERNEL = nrm2_vector.c
|
||||
DNRM2KERNEL = nrm2_vector.c
|
||||
|
||||
@@ -45,6 +45,11 @@ DAXPYKERNEL = ../riscv64/axpy.c
|
||||
CAXPYKERNEL = ../riscv64/zaxpy.c
|
||||
ZAXPYKERNEL = ../riscv64/zaxpy.c
|
||||
|
||||
SAXPBYKERNEL = ../riscv64/axpby.c
|
||||
DAXPBYKERNEL = ../riscv64/axpby.c
|
||||
CAXPBYKERNEL = ../riscv64/zaxpby.c
|
||||
ZAXPBYKERNEL = ../riscv64/zaxpby.c
|
||||
|
||||
SCOPYKERNEL = ../riscv64/copy.c
|
||||
DCOPYKERNEL = ../riscv64/copy.c
|
||||
CCOPYKERNEL = ../riscv64/zcopy.c
|
||||
|
||||
243
kernel/riscv64/KERNEL.RISCV64_ZVL128B
Normal file
243
kernel/riscv64/KERNEL.RISCV64_ZVL128B
Normal file
@@ -0,0 +1,243 @@
|
||||
SAMAXKERNEL = amax_rvv.c
|
||||
DAMAXKERNEL = amax_rvv.c
|
||||
CAMAXKERNEL = zamax_rvv.c
|
||||
ZAMAXKERNEL = zamax_rvv.c
|
||||
|
||||
SAMINKERNEL = amin_rvv.c
|
||||
DAMINKERNEL = amin_rvv.c
|
||||
CAMINKERNEL = zamin_rvv.c
|
||||
ZAMINKERNEL = zamin_rvv.c
|
||||
|
||||
SMAXKERNEL = max_rvv.c
|
||||
DMAXKERNEL = max_rvv.c
|
||||
|
||||
SMINKERNEL = min_rvv.c
|
||||
DMINKERNEL = min_rvv.c
|
||||
|
||||
ISAMAXKERNEL = iamax_rvv.c
|
||||
IDAMAXKERNEL = iamax_rvv.c
|
||||
ICAMAXKERNEL = izamax_rvv.c
|
||||
IZAMAXKERNEL = izamax_rvv.c
|
||||
|
||||
ISAMINKERNEL = iamin_rvv.c
|
||||
IDAMINKERNEL = iamin_rvv.c
|
||||
ICAMINKERNEL = izamin_rvv.c
|
||||
IZAMINKERNEL = izamin_rvv.c
|
||||
|
||||
ISMAXKERNEL = imax_rvv.c
|
||||
IDMAXKERNEL = imax_rvv.c
|
||||
|
||||
ISMINKERNEL = imin_rvv.c
|
||||
IDMINKERNEL = imin_rvv.c
|
||||
|
||||
SASUMKERNEL = asum_rvv.c
|
||||
DASUMKERNEL = asum_rvv.c
|
||||
CASUMKERNEL = zasum_rvv.c
|
||||
ZASUMKERNEL = zasum_rvv.c
|
||||
|
||||
SSUMKERNEL = sum_rvv.c
|
||||
DSUMKERNEL = sum_rvv.c
|
||||
CSUMKERNEL = zsum_rvv.c
|
||||
ZSUMKERNEL = zsum_rvv.c
|
||||
|
||||
SAXPYKERNEL = axpy_rvv.c
|
||||
DAXPYKERNEL = axpy_rvv.c
|
||||
CAXPYKERNEL = zaxpy_rvv.c
|
||||
ZAXPYKERNEL = zaxpy_rvv.c
|
||||
|
||||
SAXPBYKERNEL = axpby_rvv.c
|
||||
DAXPBYKERNEL = axpby_rvv.c
|
||||
CAXPBYKERNEL = zaxpby_rvv.c
|
||||
ZAXPBYKERNEL = zaxpby_rvv.c
|
||||
|
||||
SCOPYKERNEL = copy_rvv.c
|
||||
DCOPYKERNEL = copy_rvv.c
|
||||
CCOPYKERNEL = zcopy_rvv.c
|
||||
ZCOPYKERNEL = zcopy_rvv.c
|
||||
|
||||
SDOTKERNEL = dot_rvv.c
|
||||
DDOTKERNEL = dot_rvv.c
|
||||
CDOTKERNEL = zdot_rvv.c
|
||||
ZDOTKERNEL = zdot_rvv.c
|
||||
DSDOTKERNEL = dot_rvv.c
|
||||
|
||||
SNRM2KERNEL = nrm2_rvv.c
|
||||
DNRM2KERNEL = nrm2_rvv.c
|
||||
CNRM2KERNEL = znrm2_rvv.c
|
||||
ZNRM2KERNEL = znrm2_rvv.c
|
||||
|
||||
SROTKERNEL = rot_rvv.c
|
||||
DROTKERNEL = rot_rvv.c
|
||||
CROTKERNEL = zrot_rvv.c
|
||||
ZROTKERNEL = zrot_rvv.c
|
||||
|
||||
SSCALKERNEL = scal_rvv.c
|
||||
DSCALKERNEL = scal_rvv.c
|
||||
CSCALKERNEL = zscal_rvv.c
|
||||
ZSCALKERNEL = zscal_rvv.c
|
||||
|
||||
SSWAPKERNEL = swap_rvv.c
|
||||
DSWAPKERNEL = swap_rvv.c
|
||||
CSWAPKERNEL = zswap_rvv.c
|
||||
ZSWAPKERNEL = zswap_rvv.c
|
||||
|
||||
SGEMVNKERNEL = gemv_n_rvv.c
|
||||
DGEMVNKERNEL = gemv_n_rvv.c
|
||||
CGEMVNKERNEL = zgemv_n_rvv.c
|
||||
ZGEMVNKERNEL = zgemv_n_rvv.c
|
||||
|
||||
SGEMVTKERNEL = gemv_t_rvv.c
|
||||
DGEMVTKERNEL = gemv_t_rvv.c
|
||||
CGEMVTKERNEL = zgemv_t_rvv.c
|
||||
ZGEMVTKERNEL = zgemv_t_rvv.c
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl128b.c
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl128b.c
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
|
||||
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
|
||||
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
|
||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl128b.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl128b.c
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
|
||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl128b.c
|
||||
STRMMUNCOPY_M = ../generic/trmm_uncopy_$(SGEMM_UNROLL_M).c
|
||||
STRMMLNCOPY_M = ../generic/trmm_lncopy_$(SGEMM_UNROLL_M).c
|
||||
STRMMUTCOPY_M = ../generic/trmm_utcopy_$(SGEMM_UNROLL_M).c
|
||||
STRMMLTCOPY_M = ../generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c
|
||||
|
||||
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl128b.c
|
||||
DTRMMUNCOPY_M = ../generic/trmm_uncopy_$(DGEMM_UNROLL_M).c
|
||||
DTRMMLNCOPY_M = ../generic/trmm_lncopy_$(DGEMM_UNROLL_M).c
|
||||
DTRMMUTCOPY_M = ../generic/trmm_utcopy_$(DGEMM_UNROLL_M).c
|
||||
DTRMMLTCOPY_M = ../generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c
|
||||
|
||||
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl128b.c
|
||||
CTRMMUNCOPY_M = ../generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c
|
||||
CTRMMLNCOPY_M = ../generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c
|
||||
CTRMMUTCOPY_M = ../generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c
|
||||
CTRMMLTCOPY_M = ../generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c
|
||||
|
||||
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl128b.c
|
||||
ZTRMMUNCOPY_M = ../generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c
|
||||
ZTRMMLNCOPY_M = ../generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c
|
||||
ZTRMMUTCOPY_M = ../generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c
|
||||
ZTRMMLTCOPY_M = ../generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
SSYMV_U_KERNEL = symv_U_rvv.c
|
||||
SSYMV_L_KERNEL = symv_L_rvv.c
|
||||
DSYMV_U_KERNEL = symv_U_rvv.c
|
||||
DSYMV_L_KERNEL = symv_L_rvv.c
|
||||
CSYMV_U_KERNEL = zsymv_U_rvv.c
|
||||
CSYMV_L_KERNEL = zsymv_L_rvv.c
|
||||
ZSYMV_U_KERNEL = zsymv_U_rvv.c
|
||||
ZSYMV_L_KERNEL = zsymv_L_rvv.c
|
||||
|
||||
CHEMV_L_KERNEL = zhemv_LM_rvv.c
|
||||
CHEMV_M_KERNEL = zhemv_LM_rvv.c
|
||||
CHEMV_U_KERNEL = zhemv_UV_rvv.c
|
||||
CHEMV_V_KERNEL = zhemv_UV_rvv.c
|
||||
ZHEMV_L_KERNEL = zhemv_LM_rvv.c
|
||||
ZHEMV_M_KERNEL = zhemv_LM_rvv.c
|
||||
ZHEMV_U_KERNEL = zhemv_UV_rvv.c
|
||||
ZHEMV_V_KERNEL = zhemv_UV_rvv.c
|
||||
|
||||
SSYMMUCOPY_M = ../generic/symm_ucopy_$(SGEMM_UNROLL_M).c
|
||||
SSYMMLCOPY_M = ../generic/symm_lcopy_$(SGEMM_UNROLL_M).c
|
||||
|
||||
DSYMMUCOPY_M = ../generic/symm_ucopy_$(DGEMM_UNROLL_M).c
|
||||
DSYMMLCOPY_M = ../generic/symm_lcopy_$(DGEMM_UNROLL_M).c
|
||||
|
||||
CSYMMUCOPY_M = ../generic/zsymm_ucopy_$(CGEMM_UNROLL_M).c
|
||||
CSYMMLCOPY_M = ../generic/zsymm_lcopy_$(CGEMM_UNROLL_M).c
|
||||
|
||||
ZSYMMUCOPY_M = ../generic/zsymm_ucopy_$(ZGEMM_UNROLL_M).c
|
||||
ZSYMMLCOPY_M = ../generic/zsymm_lcopy_$(ZGEMM_UNROLL_M).c
|
||||
|
||||
CHEMMLTCOPY_M = ../generic/zhemm_ltcopy_$(CGEMM_UNROLL_M).c
|
||||
CHEMMUTCOPY_M = ../generic/zhemm_utcopy_$(CGEMM_UNROLL_M).c
|
||||
|
||||
ZHEMMLTCOPY_M = ../generic/zhemm_ltcopy_$(ZGEMM_UNROLL_M).c
|
||||
ZHEMMUTCOPY_M = ../generic/zhemm_utcopy_$(ZGEMM_UNROLL_M).c
|
||||
|
||||
LSAME_KERNEL = ../generic/lsame.c
|
||||
|
||||
SCABS_KERNEL = ../generic/cabs.c
|
||||
DCABS_KERNEL = ../generic/cabs.c
|
||||
QCABS_KERNEL = ../generic/cabs.c
|
||||
|
||||
ifndef SGEMM_BETA
|
||||
SGEMM_BETA = gemm_beta_rvv.c
|
||||
endif
|
||||
ifndef DGEMM_BETA
|
||||
DGEMM_BETA = gemm_beta_rvv.c
|
||||
endif
|
||||
ifndef CGEMM_BETA
|
||||
CGEMM_BETA = zgemm_beta_rvv.c
|
||||
endif
|
||||
ifndef ZGEMM_BETA
|
||||
ZGEMM_BETA = zgemm_beta_rvv.c
|
||||
endif
|
||||
199
kernel/riscv64/KERNEL.RISCV64_ZVL256B
Normal file
199
kernel/riscv64/KERNEL.RISCV64_ZVL256B
Normal file
@@ -0,0 +1,199 @@
|
||||
SAMAXKERNEL = amax_vector.c
|
||||
DAMAXKERNEL = amax_vector.c
|
||||
CAMAXKERNEL = zamax_vector.c
|
||||
ZAMAXKERNEL = zamax_vector.c
|
||||
|
||||
SAMINKERNEL = amin_vector.c
|
||||
DAMINKERNEL = amin_vector.c
|
||||
CAMINKERNEL = zamin_vector.c
|
||||
ZAMINKERNEL = zamin_vector.c
|
||||
|
||||
SMAXKERNEL = max_vector.c
|
||||
DMAXKERNEL = max_vector.c
|
||||
|
||||
SMINKERNEL = min_vector.c
|
||||
DMINKERNEL = min_vector.c
|
||||
|
||||
ISAMAXKERNEL = iamax_vector.c
|
||||
IDAMAXKERNEL = iamax_vector.c
|
||||
ICAMAXKERNEL = izamax_vector.c
|
||||
IZAMAXKERNEL = izamax_vector.c
|
||||
|
||||
ISAMINKERNEL = iamin_vector.c
|
||||
IDAMINKERNEL = iamin_vector.c
|
||||
ICAMINKERNEL = izamin_vector.c
|
||||
IZAMINKERNEL = izamin_vector.c
|
||||
|
||||
ISMAXKERNEL = imax_vector.c
|
||||
IDMAXKERNEL = imax_vector.c
|
||||
|
||||
ISMINKERNEL = imin_vector.c
|
||||
IDMINKERNEL = imin_vector.c
|
||||
|
||||
SASUMKERNEL = asum_vector.c
|
||||
DASUMKERNEL = asum_vector.c
|
||||
CASUMKERNEL = zasum_vector.c
|
||||
ZASUMKERNEL = zasum_vector.c
|
||||
|
||||
SSUMKERNEL = sum_vector.c
|
||||
DSUMKERNEL = sum_vector.c
|
||||
CSUMKERNEL = zsum_vector.c
|
||||
ZSUMKERNEL = zsum_vector.c
|
||||
|
||||
SAXPYKERNEL = axpy_vector.c
|
||||
DAXPYKERNEL = axpy_vector.c
|
||||
CAXPYKERNEL = zaxpy_vector.c
|
||||
ZAXPYKERNEL = zaxpy_vector.c
|
||||
|
||||
SCOPYKERNEL = copy_vector.c
|
||||
DCOPYKERNEL = copy_vector.c
|
||||
CCOPYKERNEL = zcopy_vector.c
|
||||
ZCOPYKERNEL = zcopy_vector.c
|
||||
|
||||
SDOTKERNEL = dot_vector.c
|
||||
DDOTKERNEL = dot_vector.c
|
||||
CDOTKERNEL = zdot_vector.c
|
||||
ZDOTKERNEL = zdot_vector.c
|
||||
DSDOTKERNEL = ../generic/dot.c
|
||||
|
||||
SNRM2KERNEL = nrm2_vector.c
|
||||
DNRM2KERNEL = nrm2_vector.c
|
||||
CNRM2KERNEL = znrm2_vector.c
|
||||
ZNRM2KERNEL = znrm2_vector.c
|
||||
|
||||
SROTKERNEL = rot_vector.c
|
||||
DROTKERNEL = rot_vector.c
|
||||
CROTKERNEL = zrot_vector.c
|
||||
ZROTKERNEL = zrot_vector.c
|
||||
|
||||
SSCALKERNEL = scal_vector.c
|
||||
DSCALKERNEL = scal_vector.c
|
||||
CSCALKERNEL = zscal_vector.c
|
||||
ZSCALKERNEL = zscal_vector.c
|
||||
|
||||
SSWAPKERNEL = swap_vector.c
|
||||
DSWAPKERNEL = swap_vector.c
|
||||
CSWAPKERNEL = zswap_vector.c
|
||||
ZSWAPKERNEL = zswap_vector.c
|
||||
|
||||
SGEMVNKERNEL = gemv_n_vector.c
|
||||
DGEMVNKERNEL = gemv_n_vector.c
|
||||
CGEMVNKERNEL = zgemv_n_vector.c
|
||||
ZGEMVNKERNEL = zgemv_n_vector.c
|
||||
|
||||
SGEMVTKERNEL = gemv_t_vector.c
|
||||
DGEMVTKERNEL = gemv_t_vector.c
|
||||
CGEMVTKERNEL = zgemv_t_vector.c
|
||||
ZGEMVTKERNEL = zgemv_t_vector.c
|
||||
|
||||
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl256b.c
|
||||
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl256b.c
|
||||
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl256b.c
|
||||
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl256b.c
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl256b.c
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl256b.c
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
|
||||
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
|
||||
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
|
||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl256b.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl256b.c
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
|
||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
SSYMV_U_KERNEL = symv_U_vector.c
|
||||
SSYMV_L_KERNEL = symv_L_vector.c
|
||||
DSYMV_U_KERNEL = symv_U_vector.c
|
||||
DSYMV_L_KERNEL = symv_L_vector.c
|
||||
CSYMV_U_KERNEL = ../generic/zsymv_k.c
|
||||
CSYMV_L_KERNEL = ../generic/zsymv_k.c
|
||||
ZSYMV_U_KERNEL = ../generic/zsymv_k.c
|
||||
ZSYMV_L_KERNEL = ../generic/zsymv_k.c
|
||||
|
||||
CHEMV_L_KERNEL = zhemv_LM_vector.c
|
||||
CHEMV_M_KERNEL = zhemv_LM_vector.c
|
||||
CHEMV_U_KERNEL = zhemv_UV_vector.c
|
||||
CHEMV_V_KERNEL = zhemv_UV_vector.c
|
||||
ZHEMV_L_KERNEL = zhemv_LM_vector.c
|
||||
ZHEMV_M_KERNEL = zhemv_LM_vector.c
|
||||
ZHEMV_U_KERNEL = zhemv_UV_vector.c
|
||||
ZHEMV_V_KERNEL = zhemv_UV_vector.c
|
||||
|
||||
LSAME_KERNEL = ../generic/lsame.c
|
||||
|
||||
SCABS_KERNEL = ../generic/cabs.c
|
||||
DCABS_KERNEL = ../generic/cabs.c
|
||||
QCABS_KERNEL = ../generic/cabs.c
|
||||
|
||||
ifndef SGEMM_BETA
|
||||
SGEMM_BETA = ../generic/gemm_beta.c
|
||||
endif
|
||||
ifndef DGEMM_BETA
|
||||
DGEMM_BETA = ../generic/gemm_beta.c
|
||||
endif
|
||||
ifndef CGEMM_BETA
|
||||
CGEMM_BETA = ../generic/zgemm_beta.c
|
||||
endif
|
||||
ifndef ZGEMM_BETA
|
||||
ZGEMM_BETA = ../generic/zgemm_beta.c
|
||||
endif
|
||||
281
kernel/riscv64/KERNEL.x280
Normal file
281
kernel/riscv64/KERNEL.x280
Normal file
@@ -0,0 +1,281 @@
|
||||
# **********************************************************************************
|
||||
# Copyright (c) 2022, The OpenBLAS Project
|
||||
# All rights reserved.
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
# 1. Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# 2. Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in
|
||||
# the documentation and/or other materials provided with the
|
||||
# distribution.
|
||||
# 3. Neither the name of the OpenBLAS project nor the names of
|
||||
# its contributors may be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
# **********************************************************************************
|
||||
|
||||
SAMAXKERNEL = amax_rvv.c
|
||||
DAMAXKERNEL = amax_rvv.c
|
||||
CAMAXKERNEL = zamax_rvv.c
|
||||
ZAMAXKERNEL = zamax_rvv.c
|
||||
|
||||
SAMINKERNEL = amin_rvv.c
|
||||
DAMINKERNEL = amin_rvv.c
|
||||
CAMINKERNEL = zamin_rvv.c
|
||||
ZAMINKERNEL = zamin_rvv.c
|
||||
|
||||
SMAXKERNEL = max_rvv.c
|
||||
DMAXKERNEL = max_rvv.c
|
||||
|
||||
SMINKERNEL = min_rvv.c
|
||||
DMINKERNEL = min_rvv.c
|
||||
|
||||
ISAMAXKERNEL = iamax_rvv.c
|
||||
IDAMAXKERNEL = iamax_rvv.c
|
||||
ICAMAXKERNEL = izamax_rvv.c
|
||||
IZAMAXKERNEL = izamax_rvv.c
|
||||
|
||||
ISAMINKERNEL = iamin_rvv.c
|
||||
IDAMINKERNEL = iamin_rvv.c
|
||||
ICAMINKERNEL = izamin_rvv.c
|
||||
IZAMINKERNEL = izamin_rvv.c
|
||||
|
||||
ISMAXKERNEL = imax_rvv.c
|
||||
IDMAXKERNEL = imax_rvv.c
|
||||
|
||||
ISMINKERNEL = imin_rvv.c
|
||||
IDMINKERNEL = imin_rvv.c
|
||||
|
||||
SASUMKERNEL = asum_rvv.c
|
||||
DASUMKERNEL = asum_rvv.c
|
||||
CASUMKERNEL = zasum_rvv.c
|
||||
ZASUMKERNEL = zasum_rvv.c
|
||||
|
||||
SSUMKERNEL = sum_rvv.c
|
||||
DSUMKERNEL = sum_rvv.c
|
||||
CSUMKERNEL = zsum_rvv.c
|
||||
ZSUMKERNEL = zsum_rvv.c
|
||||
|
||||
SAXPYKERNEL = axpy_rvv.c
|
||||
DAXPYKERNEL = axpy_rvv.c
|
||||
CAXPYKERNEL = zaxpy_rvv.c
|
||||
ZAXPYKERNEL = zaxpy_rvv.c
|
||||
|
||||
SAXPBYKERNEL = axpby_rvv.c
|
||||
DAXPBYKERNEL = axpby_rvv.c
|
||||
CAXPBYKERNEL = zaxpby_rvv.c
|
||||
ZAXPBYKERNEL = zaxpby_rvv.c
|
||||
|
||||
SCOPYKERNEL = copy_rvv.c
|
||||
DCOPYKERNEL = copy_rvv.c
|
||||
CCOPYKERNEL = zcopy_rvv.c
|
||||
ZCOPYKERNEL = zcopy_rvv.c
|
||||
|
||||
SDOTKERNEL = dot_rvv.c
|
||||
DDOTKERNEL = dot_rvv.c
|
||||
CDOTKERNEL = zdot_rvv.c
|
||||
ZDOTKERNEL = zdot_rvv.c
|
||||
DSDOTKERNEL = dot_rvv.c
|
||||
|
||||
SNRM2KERNEL = nrm2_rvv.c
|
||||
DNRM2KERNEL = nrm2_rvv.c
|
||||
CNRM2KERNEL = znrm2_rvv.c
|
||||
ZNRM2KERNEL = znrm2_rvv.c
|
||||
|
||||
SROTKERNEL = rot_rvv.c
|
||||
DROTKERNEL = rot_rvv.c
|
||||
CROTKERNEL = zrot_rvv.c
|
||||
ZROTKERNEL = zrot_rvv.c
|
||||
|
||||
SSCALKERNEL = scal_rvv.c
|
||||
DSCALKERNEL = scal_rvv.c
|
||||
CSCALKERNEL = zscal_rvv.c
|
||||
ZSCALKERNEL = zscal_rvv.c
|
||||
|
||||
SSWAPKERNEL = swap_rvv.c
|
||||
DSWAPKERNEL = swap_rvv.c
|
||||
CSWAPKERNEL = zswap_rvv.c
|
||||
ZSWAPKERNEL = zswap_rvv.c
|
||||
|
||||
SGEMVNKERNEL = gemv_n_rvv.c
|
||||
DGEMVNKERNEL = gemv_n_rvv.c
|
||||
CGEMVNKERNEL = zgemv_n_rvv.c
|
||||
ZGEMVNKERNEL = zgemv_n_rvv.c
|
||||
|
||||
SGEMVTKERNEL = gemv_t_rvv.c
|
||||
DGEMVTKERNEL = gemv_t_rvv.c
|
||||
CGEMVTKERNEL = zgemv_t_rvv.c
|
||||
ZGEMVTKERNEL = zgemv_t_rvv.c
|
||||
|
||||
CTRMMKERNEL = ztrmmkernel_rvv_v1x4.c
|
||||
ZTRMMKERNEL = ztrmmkernel_rvv_v1x4.c
|
||||
|
||||
# SGEMM_UNROLL_N set in params.h
|
||||
ifeq ($(SGEMM_UNROLL_N), 8)
|
||||
# UNROLL_M is VLMAX
|
||||
SGEMMKERNEL = gemmkernel_rvv_v1x8.c
|
||||
SGEMMINCOPY = gemm_ncopy_rvv_v1.c
|
||||
SGEMMITCOPY = gemm_tcopy_rvv_v1.c
|
||||
SGEMMONCOPY = gemm_ncopy_$(SGEMM_UNROLL_N)_rvv.c
|
||||
SGEMMOTCOPY = gemm_tcopy_$(SGEMM_UNROLL_N)_rvv.c
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
STRMMKERNEL = trmmkernel_rvv_v1x8.c
|
||||
|
||||
STRMMUNCOPY_M = trmm_uncopy_rvv_v1.c
|
||||
STRMMLNCOPY_M = trmm_lncopy_rvv_v1.c
|
||||
STRMMUTCOPY_M = trmm_utcopy_rvv_v1.c
|
||||
STRMMLTCOPY_M = trmm_ltcopy_rvv_v1.c
|
||||
|
||||
SSYMMUCOPY_M = symm_ucopy_rvv_v1.c
|
||||
SSYMMLCOPY_M = symm_lcopy_rvv_v1.c
|
||||
endif
|
||||
|
||||
# DGEMM_UNROLL_N set in params.h
|
||||
ifeq ($(DGEMM_UNROLL_N), 8)
|
||||
# UNROLL_M is VLMAX
|
||||
DGEMMKERNEL = gemmkernel_rvv_v1x8.c
|
||||
DGEMMINCOPY = gemm_ncopy_rvv_v1.c
|
||||
DGEMMITCOPY = gemm_tcopy_rvv_v1.c
|
||||
DGEMMONCOPY = gemm_ncopy_$(DGEMM_UNROLL_N)_rvv.c
|
||||
DGEMMOTCOPY = gemm_tcopy_$(DGEMM_UNROLL_N)_rvv.c
|
||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DTRMMKERNEL = trmmkernel_rvv_v1x8.c
|
||||
DTRMMUNCOPY_M = trmm_uncopy_rvv_v1.c
|
||||
DTRMMLNCOPY_M = trmm_lncopy_rvv_v1.c
|
||||
DTRMMUTCOPY_M = trmm_utcopy_rvv_v1.c
|
||||
DTRMMLTCOPY_M = trmm_ltcopy_rvv_v1.c
|
||||
|
||||
DSYMMUCOPY_M = symm_ucopy_rvv_v1.c
|
||||
DSYMMLCOPY_M = symm_lcopy_rvv_v1.c
|
||||
endif
|
||||
|
||||
CGEMMKERNEL = zgemmkernel_rvv_v1x4.c
|
||||
CGEMMINCOPY = zgemm_ncopy_rvv_v1.c
|
||||
CGEMMITCOPY = zgemm_tcopy_rvv_v1.c
|
||||
CGEMMONCOPY = zgemm_ncopy_4_rvv.c
|
||||
CGEMMOTCOPY = zgemm_tcopy_4_rvv.c
|
||||
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZGEMMKERNEL = zgemmkernel_rvv_v1x4.c
|
||||
|
||||
ZGEMMINCOPY = zgemm_ncopy_rvv_v1.c
|
||||
ZGEMMITCOPY = zgemm_tcopy_rvv_v1.c
|
||||
ZGEMMONCOPY = zgemm_ncopy_4_rvv.c
|
||||
ZGEMMOTCOPY = zgemm_tcopy_4_rvv.c
|
||||
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
STRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c
|
||||
STRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c
|
||||
STRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c
|
||||
STRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c
|
||||
|
||||
DTRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c
|
||||
DTRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c
|
||||
DTRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c
|
||||
DTRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c
|
||||
|
||||
CTRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c
|
||||
CTRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c
|
||||
CTRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c
|
||||
CTRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c
|
||||
|
||||
ZTRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c
|
||||
ZTRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c
|
||||
ZTRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c
|
||||
ZTRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c
|
||||
|
||||
TRSMCOPYLN_M = trsm_lncopy_rvv_v1.c
|
||||
TRSMCOPYLT_M = trsm_ltcopy_rvv_v1.c
|
||||
TRSMCOPYUN_M = trsm_uncopy_rvv_v1.c
|
||||
TRSMCOPYUT_M = trsm_utcopy_rvv_v1.c
|
||||
|
||||
ZTRSMCOPYLN_M = ztrsm_lncopy_rvv_v1.c
|
||||
ZTRSMCOPYLT_M = ztrsm_ltcopy_rvv_v1.c
|
||||
ZTRSMCOPYUN_M = ztrsm_uncopy_rvv_v1.c
|
||||
ZTRSMCOPYUT_M = ztrsm_utcopy_rvv_v1.c
|
||||
|
||||
SSYMV_U_KERNEL = symv_U_rvv.c
|
||||
SSYMV_L_KERNEL = symv_L_rvv.c
|
||||
DSYMV_U_KERNEL = symv_U_rvv.c
|
||||
DSYMV_L_KERNEL = symv_L_rvv.c
|
||||
CSYMV_U_KERNEL = zsymv_U_rvv.c
|
||||
CSYMV_L_KERNEL = zsymv_L_rvv.c
|
||||
ZSYMV_U_KERNEL = zsymv_U_rvv.c
|
||||
ZSYMV_L_KERNEL = zsymv_L_rvv.c
|
||||
|
||||
CHEMV_L_KERNEL = zhemv_LM_rvv.c
|
||||
CHEMV_M_KERNEL = zhemv_LM_rvv.c
|
||||
CHEMV_U_KERNEL = zhemv_UV_rvv.c
|
||||
CHEMV_V_KERNEL = zhemv_UV_rvv.c
|
||||
ZHEMV_L_KERNEL = zhemv_LM_rvv.c
|
||||
ZHEMV_M_KERNEL = zhemv_LM_rvv.c
|
||||
ZHEMV_U_KERNEL = zhemv_UV_rvv.c
|
||||
ZHEMV_V_KERNEL = zhemv_UV_rvv.c
|
||||
|
||||
ZHEMMLTCOPY_M = zhemm_ltcopy_rvv_v1.c
|
||||
ZHEMMUTCOPY_M = zhemm_utcopy_rvv_v1.c
|
||||
|
||||
CHEMMLTCOPY_M = zhemm_ltcopy_rvv_v1.c
|
||||
CHEMMUTCOPY_M = zhemm_utcopy_rvv_v1.c
|
||||
|
||||
ZSYMMUCOPY_M = zsymm_ucopy_rvv_v1.c
|
||||
ZSYMMLCOPY_M = zsymm_lcopy_rvv_v1.c
|
||||
|
||||
CSYMMUCOPY_M = zsymm_ucopy_rvv_v1.c
|
||||
CSYMMLCOPY_M = zsymm_lcopy_rvv_v1.c
|
||||
|
||||
ZTRMMUNCOPY_M = ztrmm_uncopy_rvv_v1.c
|
||||
ZTRMMLNCOPY_M = ztrmm_lncopy_rvv_v1.c
|
||||
ZTRMMUTCOPY_M = ztrmm_utcopy_rvv_v1.c
|
||||
ZTRMMLTCOPY_M = ztrmm_ltcopy_rvv_v1.c
|
||||
|
||||
CTRMMUNCOPY_M = ztrmm_uncopy_rvv_v1.c
|
||||
CTRMMLNCOPY_M = ztrmm_lncopy_rvv_v1.c
|
||||
CTRMMUTCOPY_M = ztrmm_utcopy_rvv_v1.c
|
||||
CTRMMLTCOPY_M = ztrmm_ltcopy_rvv_v1.c
|
||||
|
||||
LSAME_KERNEL = ../generic/lsame.c
|
||||
|
||||
SCABS_KERNEL = ../generic/cabs.c
|
||||
DCABS_KERNEL = ../generic/cabs.c
|
||||
QCABS_KERNEL = ../generic/cabs.c
|
||||
|
||||
ifndef SGEMM_BETA
|
||||
SGEMM_BETA = gemm_beta_rvv.c
|
||||
endif
|
||||
ifndef DGEMM_BETA
|
||||
DGEMM_BETA = gemm_beta_rvv.c
|
||||
endif
|
||||
ifndef CGEMM_BETA
|
||||
CGEMM_BETA = zgemm_beta_rvv.c
|
||||
endif
|
||||
ifndef ZGEMM_BETA
|
||||
ZGEMM_BETA = zgemm_beta_rvv.c
|
||||
endif
|
||||
102
kernel/riscv64/amax_rvv.c
Normal file
102
kernel/riscv64/amax_rvv.c
Normal file
@@ -0,0 +1,102 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <float.h>
|
||||
|
||||
/* Precision-neutral aliases for the RVV intrinsics used by this kernel.
 * When DOUBLE is not defined they map to the 32-bit (e32 / f32) intrinsics,
 * otherwise to the 64-bit (e64 / f64) ones. Names without a suffix use the
 * m8 types/intrinsics; names ending in _M1 use the m1 ones. */
#if !defined(DOUBLE)
|
||||
/* Vector length for n remaining elements (e32, m8). */
#define VSETVL(n) __riscv_vsetvl_e32m8(n)
|
||||
/* Maximum vector length for e32 with m8 and with m1, respectively. */
#define VSETVL_MAX __riscv_vsetvlmax_e32m8()
|
||||
#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1()
|
||||
/* Vector types: m8 for the working data, m1 for the reduction result. */
#define FLOAT_V_T vfloat32m8_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
/* Loads: VLEV is used for the inc_x == 1 path, VLSEV takes a byte stride. */
#define VLEV_FLOAT __riscv_vle32_v_f32m8
|
||||
#define VLSEV_FLOAT __riscv_vlse32_v_f32m8
|
||||
/* Max-reduction of an m8 vector into an m1 vector. */
#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1
|
||||
/* Scalar-to-vector splats (m8 and m1). */
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8
|
||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
|
||||
/* Element-wise max, _tu variant (takes an extra destination operand). */
#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m8_tu
|
||||
/* Element-wise absolute value. */
#define VFABSV_FLOAT __riscv_vfabs_v_f32m8
|
||||
/* Extract the first element of an m1 vector as a scalar. */
#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32
|
||||
#else
|
||||
/* Same aliases as above, mapped to the 64-bit (e64 / f64) intrinsics. */
#define VSETVL(n) __riscv_vsetvl_e64m8(n)
|
||||
#define VSETVL_MAX __riscv_vsetvlmax_e64m8()
|
||||
#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1()
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VLEV_FLOAT __riscv_vle64_v_f64m8
|
||||
#define VLSEV_FLOAT __riscv_vlse64_v_f64m8
|
||||
#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1
|
||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8
|
||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
|
||||
#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m8_tu
|
||||
#define VFABSV_FLOAT __riscv_vfabs_v_f64m8
|
||||
#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64
|
||||
#endif
|
||||
|
||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
FLOAT maxf = 0.0;
|
||||
|
||||
if (n <= 0 || inc_x <= 0) return(maxf);
|
||||
|
||||
FLOAT_V_T vx, vmax;
|
||||
FLOAT_V_T_M1 v_res;
|
||||
|
||||
v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1);
|
||||
size_t vlmax = VSETVL_MAX;
|
||||
vmax = VFMVVF_FLOAT(0.0, vlmax);
|
||||
|
||||
if(inc_x == 1) {
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
vx = VLEV_FLOAT(x, vl);
|
||||
vx = VFABSV_FLOAT(vx, vl);
|
||||
vmax = VFMAXVV_FLOAT_TU(vmax, vmax, vx, vl);
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
BLASLONG stride_x = inc_x * sizeof(FLOAT);
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl*inc_x) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
vx = VLSEV_FLOAT(x, stride_x, vl);
|
||||
vx = VFABSV_FLOAT(vx, vl);
|
||||
vmax = VFMAXVV_FLOAT_TU(vmax, vmax, vx, vl);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
v_res = VFREDMAXVS_FLOAT(vmax, v_res, vlmax);
|
||||
maxf = VFMVFS_FLOAT_M1(v_res);
|
||||
|
||||
return(maxf);
|
||||
}
|
||||
@@ -28,36 +28,41 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m1()
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VLEV_FLOAT vle32_v_f32m8
|
||||
#define VLSEV_FLOAT vlse32_v_f32m8
|
||||
#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
|
||||
#define MASK_T vbool4_t
|
||||
#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f32m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
|
||||
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
|
||||
#define VFMAXVV_FLOAT vfmax_vv_f32m8
|
||||
#ifdef RISCV64_ZVL256B
|
||||
# define LMUL m2
|
||||
# if defined(DOUBLE)
|
||||
# define ELEN 64
|
||||
# else
|
||||
# define ELEN 32
|
||||
# endif
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e64m1()
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VLEV_FLOAT vle64_v_f64m8
|
||||
#define VLSEV_FLOAT vlse64_v_f64m8
|
||||
#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
|
||||
#define MASK_T vbool8_t
|
||||
#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f64m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
|
||||
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
|
||||
#define VFMAXVV_FLOAT vfmax_vv_f64m8
|
||||
# define LMUL m8
|
||||
# if defined(DOUBLE)
|
||||
# define ELEN 64
|
||||
# else
|
||||
# define ELEN 32
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#define _
|
||||
#define JOIN2_X(x, y) x ## y
|
||||
#define JOIN2(x, y) JOIN2_X(x, y)
|
||||
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
|
||||
|
||||
#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _)
|
||||
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
|
||||
#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _)
|
||||
#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL)
|
||||
#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL)
|
||||
#ifdef RISCV_0p10_INTRINSICS
|
||||
#define VFREDMAXVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl)
|
||||
#else
|
||||
#define VFREDMAXVS_FLOAT JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))
|
||||
#endif
|
||||
#define VFABS_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _)
|
||||
#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _)
|
||||
#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _)
|
||||
|
||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
BLASLONG i=0, j=0;
|
||||
@@ -65,103 +70,28 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
FLOAT maxf=0.0;
|
||||
if (n <= 0 || inc_x <= 0) return(maxf);
|
||||
unsigned int gvl = 0;
|
||||
FLOAT_V_T v0, v1, v_max;
|
||||
FLOAT_V_T_M1 v_res, v_zero;
|
||||
gvl = VSETVL_MAX;
|
||||
v_res = VFMVVF_FLOAT_M1(0, gvl);
|
||||
v_zero = VFMVVF_FLOAT_M1(0, gvl);
|
||||
FLOAT_V_T v0, v1;
|
||||
FLOAT_V_T_M1 v_res;
|
||||
v_res = VFMVVF_FLOAT_M1(0, 1);
|
||||
|
||||
MASK_T mask0, mask1;
|
||||
FLOAT zero = 0.0;
|
||||
if(inc_x == 1){
|
||||
gvl = VSETVL(n);
|
||||
if(gvl <= n/2){
|
||||
v_max = VFMVVF_FLOAT(0, gvl);
|
||||
for(i=0,j=0; i<n/(gvl*2); i++){
|
||||
v0 = VLEV_FLOAT(&x[j], gvl);
|
||||
v1 = VLEV_FLOAT(&x[j+gvl], gvl);
|
||||
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
|
||||
//v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
|
||||
#if defined(DOUBLE)
|
||||
asm volatile(
|
||||
"vsetvli zero, zero, e8, m1\n\t"
|
||||
"vor.vv v0, %1, %1\n\t"
|
||||
"vsetvli x0, %3, e64,m8 \n\t"
|
||||
"vfrsub.vf %0, %0, %2, v0.t \n\t"
|
||||
:"+vd"(v0)
|
||||
:"vd"(mask0), "f"(zero), "r"(gvl)
|
||||
:"v0");
|
||||
#else
|
||||
asm volatile(
|
||||
"vsetvli zero, zero, e8, m1\n\t"
|
||||
"vor.vv v0, %1, %1\n\t"
|
||||
"vsetvli x0, %3, e32,m8 \n\t"
|
||||
"vfrsub.vf %0, %0, %2, v0.t \n\t"
|
||||
:"+vd"(v0)
|
||||
:"vd"(mask0), "f"(zero), "r"(gvl)
|
||||
:"v0");
|
||||
#endif
|
||||
|
||||
v_max = VFMAXVV_FLOAT(v_max, v0, gvl);
|
||||
|
||||
v1 = VLEV_FLOAT(&x[j+gvl], gvl);
|
||||
mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
|
||||
//v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl);
|
||||
#if defined(DOUBLE)
|
||||
asm volatile(
|
||||
"vsetvli zero, zero, e8, m1\n\t"
|
||||
"vor.vv v0, %1, %1\n\t"
|
||||
"vsetvli x0, %3, e64,m8 \n\t"
|
||||
"vfrsub.vf %0, %0, %2, v0.t \n\t"
|
||||
:"+vd"(v1)
|
||||
:"vd"(mask1), "f"(zero), "r"(gvl)
|
||||
:"v0");
|
||||
#else
|
||||
asm volatile(
|
||||
"vsetvli zero, zero, e8, m1\n\t"
|
||||
"vor.vv v0, %1, %1\n\t"
|
||||
"vsetvli x0, %3, e32,m8 \n\t"
|
||||
"vfrsub.vf %0, %0, %2, v0.t \n\t"
|
||||
:"+vd"(v1)
|
||||
:"vd"(mask1), "f"(zero), "r"(gvl)
|
||||
:"v0");
|
||||
#endif
|
||||
|
||||
v_max = VFMAXVV_FLOAT(v_max, v1, gvl);
|
||||
v0 = VFABS_FLOAT(v0, gvl);
|
||||
v1 = VFABS_FLOAT(v1, gvl);
|
||||
v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl);
|
||||
v_res = VFREDMAXVS_FLOAT(v1, v_res, gvl);
|
||||
j += gvl*2;
|
||||
}
|
||||
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_zero, gvl);
|
||||
maxf = *((FLOAT*)&v_res);
|
||||
//maxf = v_res[0];
|
||||
}
|
||||
for(;j<n;){
|
||||
gvl = VSETVL(n-j);
|
||||
v0 = VLEV_FLOAT(&x[j], gvl);
|
||||
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
|
||||
//v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
|
||||
#if defined(DOUBLE)
|
||||
asm volatile(
|
||||
"vsetvli zero, zero, e8, m1\n\t"
|
||||
"vor.vv v0, %1, %1\n\t"
|
||||
"vsetvli x0, %3, e64,m8 \n\t"
|
||||
"vfrsub.vf %0, %0, %2, v0.t \n\t"
|
||||
:"+vd"(v0)
|
||||
:"vd"(mask0), "f"(zero), "r"(gvl)
|
||||
:"v0");
|
||||
#else
|
||||
asm volatile(
|
||||
"vsetvli zero, zero, e8, m1\n\t"
|
||||
"vor.vv v0, %1, %1\n\t"
|
||||
"vsetvli x0, %3, e32,m8 \n\t"
|
||||
"vfrsub.vf %0, %0, %2, v0.t \n\t"
|
||||
:"+vd"(v0)
|
||||
:"vd"(mask0), "f"(zero), "r"(gvl)
|
||||
:"v0");
|
||||
#endif
|
||||
|
||||
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_zero, gvl);
|
||||
if(*((FLOAT*)&v_res) > maxf)
|
||||
maxf = *((FLOAT*)&v_res);
|
||||
v0 = VFABS_FLOAT(v0, gvl);
|
||||
v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl);
|
||||
j += gvl;
|
||||
}
|
||||
}else{
|
||||
@@ -169,94 +99,27 @@ asm volatile(
|
||||
BLASLONG stride_x = inc_x * sizeof(FLOAT);
|
||||
if(gvl <= n/2){
|
||||
BLASLONG inc_xv = inc_x * gvl;
|
||||
v_max = VFMVVF_FLOAT(0, gvl);
|
||||
for(i=0,j=0; i<n/(gvl*2); i++){
|
||||
v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
|
||||
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
|
||||
//v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
|
||||
#if defined(DOUBLE)
|
||||
asm volatile(
|
||||
"vsetvli zero, zero, e8, m1\n\t"
|
||||
"vor.vv v0, %1, %1\n\t"
|
||||
"vsetvli x0, %3, e64,m8 \n\t"
|
||||
"vfrsub.vf %0, %0, %2, v0.t \n\t"
|
||||
:"+vd"(v0)
|
||||
:"vd"(mask0), "f"(zero), "r"(gvl)
|
||||
:"v0");
|
||||
#else
|
||||
asm volatile(
|
||||
"vsetvli zero, zero, e8, m1\n\t"
|
||||
"vor.vv v0, %1, %1\n\t"
|
||||
"vsetvli x0, %3, e32,m8 \n\t"
|
||||
"vfrsub.vf %0, %0, %2, v0.t \n\t"
|
||||
:"+vd"(v0)
|
||||
:"vd"(mask0), "f"(zero), "r"(gvl)
|
||||
:"v0");
|
||||
#endif
|
||||
|
||||
v_max = VFMAXVV_FLOAT(v_max, v0, gvl);
|
||||
|
||||
v1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl);
|
||||
mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
|
||||
//v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl);
|
||||
#if defined(DOUBLE)
|
||||
asm volatile(
|
||||
"vsetvli zero, zero, e8, m1\n\t"
|
||||
"vor.vv v0, %1, %1\n\t"
|
||||
"vsetvli x0, %3, e64,m8 \n\t"
|
||||
"vfrsub.vf %0, %0, %2, v0.t \n\t"
|
||||
:"+vd"(v1)
|
||||
:"vd"(mask1), "f"(zero), "r"(gvl)
|
||||
:"v0");
|
||||
#else
|
||||
asm volatile(
|
||||
"vsetvli zero, zero, e8, m1\n\t"
|
||||
"vor.vv v0, %1, %1\n\t"
|
||||
"vsetvli x0, %3, e32,m8 \n\t"
|
||||
"vfrsub.vf %0, %0, %2, v0.t \n\t"
|
||||
:"+vd"(v1)
|
||||
:"vd"(mask1), "f"(zero), "r"(gvl)
|
||||
:"v0");
|
||||
#endif
|
||||
|
||||
v_max = VFMAXVV_FLOAT(v_max, v1, gvl);
|
||||
v0 = VFABS_FLOAT(v0, gvl);
|
||||
v1 = VFABS_FLOAT(v1, gvl);
|
||||
v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl);
|
||||
v_res = VFREDMAXVS_FLOAT(v1, v_res, gvl);
|
||||
j += gvl*2;
|
||||
ix += inc_xv*2;
|
||||
}
|
||||
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_zero, gvl);
|
||||
maxf = *((FLOAT*)&v_res);
|
||||
}
|
||||
for(;j<n;){
|
||||
gvl = VSETVL(n-j);
|
||||
v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
|
||||
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
|
||||
//v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
|
||||
#if defined(DOUBLE)
|
||||
asm volatile(
|
||||
"vsetvli zero, zero, e8, m1\n\t"
|
||||
"vor.vv v0, %1, %1\n\t"
|
||||
"vsetvli x0, %3, e64,m8 \n\t"
|
||||
"vfrsub.vf %0, %0, %2, v0.t \n\t"
|
||||
:"+vd"(v0)
|
||||
:"vd"(mask0), "f"(zero), "r"(gvl)
|
||||
:"v0");
|
||||
#else
|
||||
asm volatile(
|
||||
"vsetvli zero, zero, e8, m1\n\t"
|
||||
"vor.vv v0, %1, %1\n\t"
|
||||
"vsetvli x0, %3, e32,m8 \n\t"
|
||||
"vfrsub.vf %0, %0, %2, v0.t \n\t"
|
||||
:"+vd"(v0)
|
||||
:"vd"(mask0), "f"(zero), "r"(gvl)
|
||||
:"v0");
|
||||
#endif
|
||||
|
||||
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_zero, gvl);
|
||||
if(*((FLOAT*)&v_res) > maxf)
|
||||
maxf = *((FLOAT*)&v_res);
|
||||
v0 = VFABS_FLOAT(v0, gvl);
|
||||
v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl);
|
||||
j += gvl;
|
||||
}
|
||||
}
|
||||
|
||||
maxf = EXTRACT_FLOAT(v_res);
|
||||
return(maxf);
|
||||
}
|
||||
|
||||
|
||||
102
kernel/riscv64/amin_rvv.c
Normal file
102
kernel/riscv64/amin_rvv.c
Normal file
@@ -0,0 +1,102 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <float.h>
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) __riscv_vsetvl_e32m8(n)
|
||||
#define VSETVL_MAX __riscv_vsetvlmax_e32m8()
|
||||
#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1()
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VLEV_FLOAT __riscv_vle32_v_f32m8
|
||||
#define VLSEV_FLOAT __riscv_vlse32_v_f32m8
|
||||
#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1
|
||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8
|
||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
|
||||
#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m8_tu
|
||||
#define VFABSV_FLOAT __riscv_vfabs_v_f32m8
|
||||
#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32
|
||||
#else
|
||||
#define VSETVL(n) __riscv_vsetvl_e64m8(n)
|
||||
#define VSETVL_MAX __riscv_vsetvlmax_e64m8()
|
||||
#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1()
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VLEV_FLOAT __riscv_vle64_v_f64m8
|
||||
#define VLSEV_FLOAT __riscv_vlse64_v_f64m8
|
||||
#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1
|
||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8
|
||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
|
||||
#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m8_tu
|
||||
#define VFABSV_FLOAT __riscv_vfabs_v_f64m8
|
||||
#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64
|
||||
#endif
|
||||
|
||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
FLOAT minf = 0.0;
|
||||
|
||||
if (n <= 0 || inc_x <= 0) return(minf);
|
||||
|
||||
FLOAT_V_T vx, vmin;
|
||||
FLOAT_V_T_M1 v_res;
|
||||
|
||||
v_res = VFMVVF_FLOAT_M1(FLT_MAX, VSETVL_MAX_M1);
|
||||
size_t vlmax = VSETVL_MAX;
|
||||
vmin = VFMVVF_FLOAT(FLT_MAX, vlmax);
|
||||
|
||||
if(inc_x == 1) {
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
vx = VLEV_FLOAT(x, vl);
|
||||
vx = VFABSV_FLOAT(vx, vl);
|
||||
vmin = VFMINVV_FLOAT_TU(vmin, vmin, vx, vl);
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
BLASLONG stride_x = inc_x * sizeof(FLOAT);
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl*inc_x) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
vx = VLSEV_FLOAT(x, stride_x, vl);
|
||||
vx = VFABSV_FLOAT(vx, vl);
|
||||
vmin = VFMINVV_FLOAT_TU(vmin, vmin, vx, vl);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
v_res = VFREDMINVS_FLOAT(vmin, v_res, vlmax);
|
||||
minf = VFMVFS_FLOAT_M1(v_res);
|
||||
|
||||
return(minf);
|
||||
}
|
||||
@@ -26,232 +26,108 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
#include <float.h>
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m1()
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VLEV_FLOAT vle32_v_f32m8
|
||||
#define VLSEV_FLOAT vlse32_v_f32m8
|
||||
#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
|
||||
#define MASK_T vbool4_t
|
||||
#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f32m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
|
||||
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
|
||||
#define VFMINVV_FLOAT vfmin_vv_f32m8
|
||||
#ifdef RISCV64_ZVL256B
|
||||
# define LMUL m2
|
||||
# if defined(DOUBLE)
|
||||
# define ELEN 64
|
||||
# define ABS fabs
|
||||
# else
|
||||
# define ELEN 32
|
||||
# define ABS fabsf
|
||||
# endif
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m1()
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VLEV_FLOAT vle64_v_f64m8
|
||||
#define VLSEV_FLOAT vlse64_v_f64m8
|
||||
#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
|
||||
#define MASK_T vbool8_t
|
||||
#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f64m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
|
||||
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
|
||||
#define VFMINVV_FLOAT vfmin_vv_f64m8
|
||||
# define LMUL m8
|
||||
# if defined(DOUBLE)
|
||||
# define ELEN 64
|
||||
# define ABS fabs
|
||||
# else
|
||||
# define ELEN 32
|
||||
# define ABS fabsf
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#define _
|
||||
#define JOIN2_X(x, y) x ## y
|
||||
#define JOIN2(x, y) JOIN2_X(x, y)
|
||||
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
|
||||
|
||||
#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _)
|
||||
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
|
||||
#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _)
|
||||
#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL)
|
||||
#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL)
|
||||
#ifdef RISCV_0p10_INTRINSICS
|
||||
#define VFREDMINVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl)
|
||||
#else
|
||||
#define VFREDMINVS_FLOAT JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))
|
||||
#endif
|
||||
#define VFABS_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _)
|
||||
/* JOIN takes five pieces, so "_v_f_f" and ELEN must be separate arguments. */
#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _)
|
||||
#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _)
|
||||
|
||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
BLASLONG i=0, j=0;
|
||||
if (n <= 0 || inc_x <= 0) return(0.0);
|
||||
FLOAT minf=FLT_MAX;
|
||||
unsigned int gvl = 0;
|
||||
FLOAT_V_T v0, v1, v_min;
|
||||
FLOAT_V_T_M1 v_res, v_max;
|
||||
gvl = VSETVL_MAX;
|
||||
v_res = VFMVVF_FLOAT_M1(0, gvl);
|
||||
v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl);
|
||||
BLASLONG i=0, j=0;
|
||||
BLASLONG ix=0;
|
||||
FLOAT minf=0.0;
|
||||
if (n <= 0 || inc_x <= 0) return(minf);
|
||||
|
||||
minf = ABS(*x);
|
||||
x += inc_x;
|
||||
--n;
|
||||
if (n == 0) return(minf);
|
||||
|
||||
unsigned int gvl = 0;
|
||||
FLOAT_V_T v0, v1;
|
||||
FLOAT_V_T_M1 v_res;
|
||||
v_res = VFMVVF_FLOAT_M1(minf, 1);
|
||||
|
||||
MASK_T mask0, mask1;
|
||||
FLOAT zero = 0.0;
|
||||
if(inc_x == 1){
|
||||
gvl = VSETVL(n);
|
||||
if(gvl <= n/2){
|
||||
v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
|
||||
for(i=0,j=0; i<n/(gvl*2); i++){
|
||||
v0 = VLEV_FLOAT(&x[j], gvl);
|
||||
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
|
||||
//v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
|
||||
#if defined(DOUBLE)
|
||||
asm volatile(
|
||||
"vsetvli zero, zero, e8, m1\n\t"
|
||||
"vor.vv v0, %1, %1\n\t"
|
||||
"vsetvli x0, %3, e64,m8 \n\t"
|
||||
"vfrsub.vf %0, %0, %2, v0.t \n\t"
|
||||
:"+vd"(v0)
|
||||
:"vd"(mask0), "f"(zero), "r"(gvl)
|
||||
:"v0");
|
||||
#else
|
||||
asm volatile(
|
||||
"vsetvli zero, zero, e8, m1\n\t"
|
||||
"vor.vv v0, %1, %1\n\t"
|
||||
"vsetvli x0, %3, e32,m8 \n\t"
|
||||
"vfrsub.vf %0, %0, %2, v0.t \n\t"
|
||||
:"+vd"(v0)
|
||||
:"vd"(mask0), "f"(zero), "r"(gvl)
|
||||
:"v0");
|
||||
#endif
|
||||
v_min = VFMINVV_FLOAT(v_min, v0, gvl);
|
||||
|
||||
v1 = VLEV_FLOAT(&x[j+gvl], gvl);
|
||||
mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
|
||||
//v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl);
|
||||
#if defined(DOUBLE)
|
||||
asm volatile(
|
||||
"vsetvli zero, zero, e8, m1\n\t"
|
||||
"vor.vv v0, %1, %1\n\t"
|
||||
"vsetvli x0, %3, e64,m8 \n\t"
|
||||
"vfrsub.vf %0, %0, %2, v0.t \n\t"
|
||||
:"+vd"(v1)
|
||||
:"vd"(mask1), "f"(zero), "r"(gvl)
|
||||
:"v0");
|
||||
#else
|
||||
asm volatile(
|
||||
"vsetvli zero, zero, e8, m1\n\t"
|
||||
"vor.vv v0, %1, %1\n\t"
|
||||
"vsetvli x0, %3, e32,m8 \n\t"
|
||||
"vfrsub.vf %0, %0, %2, v0.t \n\t"
|
||||
:"+vd"(v1)
|
||||
:"vd"(mask1), "f"(zero), "r"(gvl)
|
||||
:"v0");
|
||||
#endif
|
||||
|
||||
v_min = VFMINVV_FLOAT(v_min, v1, gvl);
|
||||
v0 = VFABS_FLOAT(v0, gvl);
|
||||
v1 = VFABS_FLOAT(v1, gvl);
|
||||
v_res = VFREDMINVS_FLOAT(v0, v_res, gvl);
|
||||
v_res = VFREDMINVS_FLOAT(v1, v_res, gvl);
|
||||
j += gvl*2;
|
||||
}
|
||||
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
|
||||
minf = *((FLOAT*)&v_res);
|
||||
}
|
||||
for(;j<n;){
|
||||
gvl = VSETVL(n-j);
|
||||
v0 = VLEV_FLOAT(&x[j], gvl);
|
||||
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
|
||||
//v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
|
||||
#if defined(DOUBLE)
|
||||
asm volatile(
|
||||
"vsetvli zero, zero, e8, m1\n\t"
|
||||
"vor.vv v0, %1, %1\n\t"
|
||||
"vsetvli x0, %3, e64,m8 \n\t"
|
||||
"vfrsub.vf %0, %0, %2, v0.t \n\t"
|
||||
:"+vd"(v0)
|
||||
:"vd"(mask0), "f"(zero), "r"(gvl)
|
||||
:"v0");
|
||||
#else
|
||||
asm volatile(
|
||||
"vsetvli zero, zero, e8, m1\n\t"
|
||||
"vor.vv v0, %1, %1\n\t"
|
||||
"vsetvli x0, %3, e32,m8 \n\t"
|
||||
"vfrsub.vf %0, %0, %2, v0.t \n\t"
|
||||
:"+vd"(v0)
|
||||
:"vd"(mask0), "f"(zero), "r"(gvl)
|
||||
:"v0");
|
||||
#endif
|
||||
v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl);
|
||||
if(*((FLOAT*)&v_res) < minf)
|
||||
minf = *((FLOAT*)&v_res);
|
||||
v0 = VFABS_FLOAT(v0, gvl);
|
||||
v_res = VFREDMINVS_FLOAT(v0, v_res, gvl);
|
||||
j += gvl;
|
||||
}
|
||||
}else{
|
||||
gvl = VSETVL(n);
|
||||
BLASLONG stride_x = inc_x * sizeof(FLOAT);
|
||||
if(gvl <= n/2){
|
||||
BLASLONG idx = 0, inc_xv = inc_x * gvl;
|
||||
v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
|
||||
BLASLONG inc_xv = inc_x * gvl;
|
||||
for(i=0,j=0; i<n/(gvl*2); i++){
|
||||
v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl);
|
||||
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
|
||||
//v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
|
||||
#if defined(DOUBLE)
|
||||
asm volatile(
|
||||
"vsetvli zero, zero, e8, m1\n\t"
|
||||
"vor.vv v0, %1, %1\n\t"
|
||||
"vsetvli x0, %3, e64,m8 \n\t"
|
||||
"vfrsub.vf %0, %0, %2, v0.t \n\t"
|
||||
:"+vd"(v0)
|
||||
:"vd"(mask0), "f"(zero), "r"(gvl)
|
||||
:"v0");
|
||||
#else
|
||||
asm volatile(
|
||||
"vsetvli zero, zero, e8, m1\n\t"
|
||||
"vor.vv v0, %1, %1\n\t"
|
||||
"vsetvli x0, %3, e32,m8 \n\t"
|
||||
"vfrsub.vf %0, %0, %2, v0.t \n\t"
|
||||
:"+vd"(v0)
|
||||
:"vd"(mask0), "f"(zero), "r"(gvl)
|
||||
:"v0");
|
||||
#endif
|
||||
v_min = VFMINVV_FLOAT(v_min, v0, gvl);
|
||||
|
||||
v1 = VLSEV_FLOAT(&x[idx+inc_xv], stride_x, gvl);
|
||||
mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
|
||||
//v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl);
|
||||
#if defined(DOUBLE)
|
||||
asm volatile(
|
||||
"vsetvli zero, zero, e8, m1\n\t"
|
||||
"vor.vv v0, %1, %1\n\t"
|
||||
"vsetvli x0, %3, e64,m8 \n\t"
|
||||
"vfrsub.vf %0, %0, %2, v0.t \n\t"
|
||||
:"+vd"(v1)
|
||||
:"vd"(mask1), "f"(zero), "r"(gvl)
|
||||
:"v0");
|
||||
#else
|
||||
asm volatile(
|
||||
"vsetvli zero, zero, e8, m1\n\t"
|
||||
"vor.vv v0, %1, %1\n\t"
|
||||
"vsetvli x0, %3, e32,m8 \n\t"
|
||||
"vfrsub.vf %0, %0, %2, v0.t \n\t"
|
||||
:"+vd"(v1)
|
||||
:"vd"(mask1), "f"(zero), "r"(gvl)
|
||||
:"v0");
|
||||
#endif
|
||||
|
||||
v_min = VFMINVV_FLOAT(v_min, v1, gvl);
|
||||
v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
|
||||
v1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl);
|
||||
v0 = VFABS_FLOAT(v0, gvl);
|
||||
v1 = VFABS_FLOAT(v1, gvl);
|
||||
v_res = VFREDMINVS_FLOAT(v0, v_res, gvl);
|
||||
v_res = VFREDMINVS_FLOAT(v1, v_res, gvl);
|
||||
j += gvl*2;
|
||||
idx += inc_xv*2;
|
||||
ix += inc_xv*2;
|
||||
}
|
||||
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
|
||||
minf = *((FLOAT*)&v_res);
|
||||
}
|
||||
for(;j<n;){
|
||||
gvl = VSETVL(n-j);
|
||||
v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
|
||||
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
|
||||
//v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
|
||||
#if defined(DOUBLE)
|
||||
asm volatile(
|
||||
"vsetvli zero, zero, e8, m1\n\t"
|
||||
"vor.vv v0, %1, %1\n\t"
|
||||
"vsetvli x0, %3, e64,m8 \n\t"
|
||||
"vfrsub.vf %0, %0, %2, v0.t \n\t"
|
||||
:"+vd"(v0)
|
||||
:"vd"(mask0), "f"(zero), "r"(gvl)
|
||||
:"v0");
|
||||
#else
|
||||
asm volatile(
|
||||
"vsetvli zero, zero, e8, m1\n\t"
|
||||
"vor.vv v0, %1, %1\n\t"
|
||||
"vsetvli x0, %3, e32,m8 \n\t"
|
||||
"vfrsub.vf %0, %0, %2, v0.t \n\t"
|
||||
:"+vd"(v0)
|
||||
:"vd"(mask0), "f"(zero), "r"(gvl)
|
||||
:"v0");
|
||||
#endif
|
||||
v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl);
|
||||
if(*((FLOAT*)&v_res) < minf)
|
||||
minf = *((FLOAT*)&v_res);
|
||||
v0 = VFABS_FLOAT(v0, gvl);
|
||||
v_res = VFREDMINVS_FLOAT(v0, v_res, gvl);
|
||||
j += gvl;
|
||||
}
|
||||
}
|
||||
return(minf);
|
||||
|
||||
minf = EXTRACT_FLOAT(v_res);
|
||||
return(minf);
|
||||
}
|
||||
|
||||
|
||||
|
||||
99
kernel/riscv64/asum_rvv.c
Normal file
99
kernel/riscv64/asum_rvv.c
Normal file
@@ -0,0 +1,99 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) __riscv_vsetvl_e32m8(n)
|
||||
#define VSETVL_MAX __riscv_vsetvlmax_e32m8()
|
||||
#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1()
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VLEV_FLOAT __riscv_vle32_v_f32m8
|
||||
#define VLSEV_FLOAT __riscv_vlse32_v_f32m8
|
||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8
|
||||
#define VFADDVV_FLOAT_TU __riscv_vfadd_vv_f32m8_tu
|
||||
#define VFABSV_FLOAT __riscv_vfabs_v_f32m8
|
||||
#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m8_f32m1
|
||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
|
||||
#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32
|
||||
#else
|
||||
#define VSETVL(n) __riscv_vsetvl_e64m8(n)
|
||||
#define VSETVL_MAX __riscv_vsetvlmax_e64m8()
|
||||
#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1()
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VLEV_FLOAT __riscv_vle64_v_f64m8
|
||||
#define VLSEV_FLOAT __riscv_vlse64_v_f64m8
|
||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8
|
||||
#define VFADDVV_FLOAT_TU __riscv_vfadd_vv_f64m8_tu
|
||||
#define VFABSV_FLOAT __riscv_vfabs_v_f64m8
|
||||
#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m8_f64m1
|
||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
|
||||
#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64
|
||||
#endif
|
||||
|
||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
FLOAT asumf = 0.0;
|
||||
if (n <= 0 || inc_x <= 0) return(asumf);
|
||||
|
||||
FLOAT_V_T vx, vsum;
|
||||
FLOAT_V_T_M1 v_res;
|
||||
|
||||
v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1);
|
||||
size_t vlmax = VSETVL_MAX;
|
||||
vsum = VFMVVF_FLOAT(0.0, vlmax);
|
||||
|
||||
if(inc_x == 1) {
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
vx = VLEV_FLOAT(x, vl);
|
||||
vx = VFABSV_FLOAT(vx, vl);
|
||||
vsum = VFADDVV_FLOAT_TU(vsum, vsum, vx, vl);
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
BLASLONG stride_x = inc_x * sizeof(FLOAT);
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl*inc_x) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
vx = VLSEV_FLOAT(x, stride_x, vl);
|
||||
vx = VFABSV_FLOAT(vx, vl);
|
||||
vsum = VFADDVV_FLOAT_TU(vsum, vsum, vx, vl);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
v_res = VFREDSUMVS_FLOAT(vsum, v_res, vlmax);
|
||||
asumf = VFMVFS_FLOAT_M1(v_res);
|
||||
return(asumf);
|
||||
}
|
||||
@@ -28,111 +28,101 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m1()
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VLEV_FLOAT vle32_v_f32m8
|
||||
#define VLSEV_FLOAT vlse32_v_f32m8
|
||||
#define VFREDSUMVS_FLOAT vfredosum_vs_f32m8_f32m1
|
||||
#define MASK_T vbool4_t
|
||||
#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f32m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
|
||||
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
|
||||
#define VFADDVV_FLOAT vfadd_vv_f32m8
|
||||
#ifdef RISCV64_ZVL256B
|
||||
# define LMUL m2
|
||||
# if defined(DOUBLE)
|
||||
# define ELEN 64
|
||||
# else
|
||||
# define ELEN 32
|
||||
# endif
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e64m1()
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VLEV_FLOAT vle64_v_f64m8
|
||||
#define VLSEV_FLOAT vlse64_v_f64m8
|
||||
#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1
|
||||
#define MASK_T vbool8_t
|
||||
#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f64m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
|
||||
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
|
||||
#define VFADDVV_FLOAT vfadd_vv_f64m8
|
||||
# define LMUL m8
|
||||
# if defined(DOUBLE)
|
||||
# define ELEN 64
|
||||
# else
|
||||
# define ELEN 32
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#define _
|
||||
#define JOIN2_X(x, y) x ## y
|
||||
#define JOIN2(x, y) JOIN2_X(x, y)
|
||||
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
|
||||
|
||||
#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _)
|
||||
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
|
||||
#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _)
|
||||
#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL)
|
||||
#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL)
|
||||
#ifdef RISCV_0p10_INTRINSICS
|
||||
#define VFREDSUMVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl)
|
||||
#else
|
||||
#define VFREDSUMVS_FLOAT JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))
|
||||
#endif
|
||||
#define VFABS_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _)
|
||||
#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _)
|
||||
#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _)
|
||||
#define VFADDVV_FLOAT JOIN(RISCV_RVV(vfadd), _vv_f, ELEN, LMUL, _)
|
||||
|
||||
/*
 * Sum of absolute values of a strided vector.
 *
 * n      number of elements to accumulate
 * x      input vector
 * inc_x  distance between consecutive elements of x, in elements
 *
 * Returns the accumulated sum, or 0.0 when n <= 0 or inc_x <= 0.
 *
 * The bulk of the input is consumed two vector loads per iteration and
 * accumulated lane-wise in v_sum; v_sum is then reduced into v_res.  Any
 * remaining elements are reduced into v_res one chunk at a time.
 *
 * NOTE(review): the local must keep the name v_res; when
 * RISCV_0p10_INTRINSICS is defined the VFREDSUMVS_FLOAT macro refers to
 * v_res by name in its expansion.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG i=0, j=0;
    FLOAT asumf=0.0;
    if (n <= 0 || inc_x <= 0) return(asumf);
    unsigned int gvl = 0;
    FLOAT_V_T v0, v1, v_sum;
    FLOAT_V_T_M1 v_res;
    /* Only element 0 of v_res carries the running scalar sum. */
    v_res = VFMVVF_FLOAT_M1(0, 1);

    if(inc_x == 1){
        gvl = VSETVL(n);
        /* The two-loads-per-iteration loop needs at least 2*gvl elements. */
        if(gvl <= n/2){
            v_sum = VFMVVF_FLOAT(0, gvl);
            for(i=0,j=0; i<n/(gvl*2); i++){
                v0 = VLEV_FLOAT(&x[j], gvl);
                v0 = VFABS_FLOAT(v0, gvl);
                v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);

                v1 = VLEV_FLOAT(&x[j+gvl], gvl);
                v1 = VFABS_FLOAT(v1, gvl);
                v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);
                j += gvl * 2;
            }
            v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl);
        }
        /* Tail: whatever the unrolled loop left over. */
        for(;j<n;){
            gvl = VSETVL(n-j);
            v0 = VLEV_FLOAT(&x[j], gvl);
            v0 = VFABS_FLOAT(v0, gvl);
            v_res = VFREDSUMVS_FLOAT(v0, v_res, gvl);
            j += gvl;
        }
    }else{
        gvl = VSETVL(n);
        /* Byte stride handed to the strided-load intrinsic. */
        unsigned int stride_x = inc_x * sizeof(FLOAT);
        if(gvl <= n/2){
            v_sum = VFMVVF_FLOAT(0, gvl);
            for(i=0,j=0; i<n/(gvl*2); i++){
                v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
                v0 = VFABS_FLOAT(v0, gvl);
                v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);

                v1 = VLSEV_FLOAT(&x[(j+gvl)*inc_x], stride_x, gvl);
                v1 = VFABS_FLOAT(v1, gvl);
                v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);
                j += gvl * 2;
            }
            v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl);
        }
        /* Tail: whatever the unrolled loop left over. */
        for(;j<n;){
            gvl = VSETVL(n-j);
            v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
            v0 = VFABS_FLOAT(v0, gvl);
            v_res = VFREDSUMVS_FLOAT(v0, v_res, gvl);
            j += gvl;
        }
    }
    /* EXTRACT_FLOAT is defined outside this span; presumably it reads
       element 0 of v_res -- TODO confirm against its definition. */
    asumf = EXTRACT_FLOAT(v_res);
    return(asumf);
}
|
||||
|
||||
|
||||
@@ -33,7 +33,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *
|
||||
BLASLONG i=0;
|
||||
BLASLONG ix,iy;
|
||||
|
||||
if ( n < 0 ) return(0);
|
||||
if ( n <= 0 ) return(0);
|
||||
|
||||
ix = 0;
|
||||
iy = 0;
|
||||
|
||||
173
kernel/riscv64/axpby_rvv.c
Normal file
173
kernel/riscv64/axpby_rvv.c
Normal file
@@ -0,0 +1,173 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Precision-generic aliases for the RVV intrinsics used by this kernel.
 * The single-precision (e32/f32) intrinsics are selected unless the build
 * defines DOUBLE, in which case the e64/f64 ones are used.  All aliases use
 * the m8 (LMUL = 8) variants.
 *
 *   VSETVL(n)       vector length to use for n remaining elements
 *   FLOAT_V_T       vector register-group type
 *   VLEV_FLOAT      unit-stride load
 *   VLSEV_FLOAT     strided load
 *   VSEV_FLOAT      unit-stride store
 *   VSSEV_FLOAT     strided store
 *   VFMACCVF_FLOAT  multiply-accumulate with a scalar multiplier
 *   VFMULVF_FLOAT   multiply by a scalar
 *   VFMVVF_FLOAT    broadcast a scalar
 */
#if !defined(DOUBLE)
#define VSETVL(n)               __riscv_vsetvl_e32m8(n)
#define FLOAT_V_T               vfloat32m8_t
#define VLEV_FLOAT              __riscv_vle32_v_f32m8
#define VLSEV_FLOAT             __riscv_vlse32_v_f32m8
#define VSEV_FLOAT              __riscv_vse32_v_f32m8
#define VSSEV_FLOAT             __riscv_vsse32_v_f32m8
#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m8
#define VFMULVF_FLOAT           __riscv_vfmul_vf_f32m8
#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
#else
#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
#define FLOAT_V_T               vfloat64m8_t
#define VLEV_FLOAT              __riscv_vle64_v_f64m8
#define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
#define VSEV_FLOAT              __riscv_vse64_v_f64m8
#define VSSEV_FLOAT             __riscv_vsse64_v_f64m8
#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m8
#define VFMULVF_FLOAT           __riscv_vfmul_vf_f64m8
#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
#endif
|
||||
|
||||
/*
 * Scaled vector update: y := alpha * x + beta * y, over n elements.
 *
 * n      number of elements; nothing is done when n <= 0
 * alpha  scale applied to x
 * x      source vector
 * inc_x  distance between consecutive elements of x, in elements
 * beta   scale applied to y
 * y      destination vector, updated in place
 * inc_y  distance between consecutive elements of y, in elements
 *
 * Always returns 0.
 *
 * Zero scale factors are special-cased so the corresponding operand is never
 * loaded: when beta == 0 the previous contents of y are not read, and when
 * alpha == 0 x is not read.  Each case is further split by whether x and/or
 * y are unit-stride so that plain loads/stores are used where possible.
 *
 * The vector length vl is unsigned; it is converted to BLASLONG before being
 * multiplied by an increment so that a negative increment produces a signed
 * element offset instead of a wrapped unsigned one.
 */
int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y)
{
    FLOAT_V_T vx, vy;

    if ( n <= 0 ) return(0);

    if ( beta == 0.0 ) {
        if ( alpha == 0.0 ) {
            /* y := 0 */
            if (1 == inc_y) {
                memset(&y[0], 0, n * sizeof(FLOAT));
            } else {
                BLASLONG stride_y = inc_y * sizeof(FLOAT);
                /* First vl is the largest one used, so the zero vector
                   built here covers every later (shorter) store. */
                size_t vl = VSETVL(n);
                vy = VFMVVF_FLOAT(0.0, vl);
                for ( ; n > 0; n -= (BLASLONG)vl, y += (BLASLONG)vl * inc_y) {
                    vl = VSETVL(n);
                    VSSEV_FLOAT(y, stride_y, vy, vl);
                }
            }

        } else {
            /* y := alpha * x */
            if ((1 == inc_x) && (1 == inc_y)) {
                for (size_t vl; n > 0; n -= (BLASLONG)vl, x += vl, y += vl) {
                    vl = VSETVL(n);
                    vx = VLEV_FLOAT(x, vl);
                    vy = VFMULVF_FLOAT(vx, alpha, vl);
                    VSEV_FLOAT (y, vy, vl);
                }
            } else if (1 == inc_x) {
                BLASLONG stride_y = inc_y * sizeof(FLOAT);
                for (size_t vl; n > 0; n -= (BLASLONG)vl, x += vl, y += (BLASLONG)vl * inc_y) {
                    vl = VSETVL(n);
                    vx = VLEV_FLOAT(x, vl);
                    vy = VFMULVF_FLOAT(vx, alpha, vl);
                    VSSEV_FLOAT (y, stride_y, vy, vl);
                }
            } else if (1 == inc_y) {
                BLASLONG stride_x = inc_x * sizeof(FLOAT);
                for (size_t vl; n > 0; n -= (BLASLONG)vl, x += (BLASLONG)vl * inc_x, y += vl) {
                    vl = VSETVL(n);
                    vx = VLSEV_FLOAT(x, stride_x, vl);
                    vy = VFMULVF_FLOAT(vx, alpha, vl);
                    VSEV_FLOAT (y, vy, vl);
                }
            } else {
                BLASLONG stride_x = inc_x * sizeof(FLOAT);
                BLASLONG stride_y = inc_y * sizeof(FLOAT);
                for (size_t vl; n > 0; n -= (BLASLONG)vl, x += (BLASLONG)vl * inc_x, y += (BLASLONG)vl * inc_y) {
                    vl = VSETVL(n);
                    vx = VLSEV_FLOAT(x, stride_x, vl);
                    vy = VFMULVF_FLOAT(vx, alpha, vl);
                    VSSEV_FLOAT (y, stride_y, vy, vl);
                }
            }
        }

    } else {
        if ( alpha == 0.0 ) {
            /* y := beta * y */
            if (1 == inc_y) {
                for (size_t vl; n > 0; n -= (BLASLONG)vl, y += vl) {
                    vl = VSETVL(n);
                    vy = VLEV_FLOAT(y, vl);
                    vy = VFMULVF_FLOAT(vy, beta, vl);
                    VSEV_FLOAT (y, vy, vl);
                }
            } else {
                BLASLONG stride_y = inc_y * sizeof(FLOAT);
                for (size_t vl; n > 0; n -= (BLASLONG)vl, y += (BLASLONG)vl * inc_y) {
                    vl = VSETVL(n);
                    vy = VLSEV_FLOAT(y, stride_y, vl);
                    vy = VFMULVF_FLOAT(vy, beta, vl);
                    VSSEV_FLOAT (y, stride_y, vy, vl);
                }
            }

        } else {
            /* y := beta * y + alpha * x */
            if ((1 == inc_x) && (1 == inc_y)) {
                for (size_t vl; n > 0; n -= (BLASLONG)vl, x += vl, y += vl) {
                    vl = VSETVL(n);
                    vx = VLEV_FLOAT(x, vl);
                    vy = VLEV_FLOAT(y, vl);
                    vy = VFMULVF_FLOAT(vy, beta, vl);
                    vy = VFMACCVF_FLOAT(vy, alpha, vx, vl);
                    VSEV_FLOAT (y, vy, vl);
                }
            } else if (1 == inc_x) {
                BLASLONG stride_y = inc_y * sizeof(FLOAT);
                for (size_t vl; n > 0; n -= (BLASLONG)vl, x += vl, y += (BLASLONG)vl * inc_y) {
                    vl = VSETVL(n);
                    vx = VLEV_FLOAT(x, vl);
                    vy = VLSEV_FLOAT(y, stride_y, vl);
                    vy = VFMULVF_FLOAT(vy, beta, vl);
                    vy = VFMACCVF_FLOAT(vy, alpha, vx, vl);
                    VSSEV_FLOAT (y, stride_y, vy, vl);
                }
            } else if (1 == inc_y) {
                BLASLONG stride_x = inc_x * sizeof(FLOAT);
                for (size_t vl; n > 0; n -= (BLASLONG)vl, x += (BLASLONG)vl * inc_x, y += vl) {
                    vl = VSETVL(n);
                    vx = VLSEV_FLOAT(x, stride_x, vl);
                    vy = VLEV_FLOAT(y, vl);
                    vy = VFMULVF_FLOAT(vy, beta, vl);
                    vy = VFMACCVF_FLOAT(vy, alpha, vx, vl);
                    VSEV_FLOAT (y, vy, vl);
                }
            } else {
                BLASLONG stride_x = inc_x * sizeof(FLOAT);
                BLASLONG stride_y = inc_y * sizeof(FLOAT);
                for (size_t vl; n > 0; n -= (BLASLONG)vl, x += (BLASLONG)vl * inc_x, y += (BLASLONG)vl * inc_y) {
                    vl = VSETVL(n);
                    vx = VLSEV_FLOAT(x, stride_x, vl);
                    vy = VLSEV_FLOAT(y, stride_y, vl);
                    vy = VFMULVF_FLOAT(vy, beta, vl);
                    vy = VFMACCVF_FLOAT(vy, alpha, vx, vl);
                    VSSEV_FLOAT (y, stride_y, vy, vl);
                }
            }
        }
    }

    return(0);
}
|
||||
@@ -27,31 +27,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m4(n)
|
||||
#define FLOAT_V_T vfloat32m4_t
|
||||
#define VLEV_FLOAT vle32_v_f32m4
|
||||
#define VLSEV_FLOAT vlse32_v_f32m4
|
||||
#define VSEV_FLOAT vse32_v_f32m4
|
||||
#define VSSEV_FLOAT vsse32_v_f32m4
|
||||
#define VFMACCVF_FLOAT vfmacc_vf_f32m4
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f32m4
|
||||
#define VFMULVF_FLOAT vfmul_vf_f32m4
|
||||
#ifdef RISCV64_ZVL256B
|
||||
# define LMUL m2
|
||||
# if defined(DOUBLE)
|
||||
# define ELEN 64
|
||||
# else
|
||||
# define ELEN 32
|
||||
# endif
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m4(n)
|
||||
#define FLOAT_V_T vfloat64m4_t
|
||||
#define VLEV_FLOAT vle64_v_f64m4
|
||||
#define VLSEV_FLOAT vlse64_v_f64m4
|
||||
#define VSEV_FLOAT vse64_v_f64m4
|
||||
#define VSSEV_FLOAT vsse64_v_f64m4
|
||||
#define VFMACCVF_FLOAT vfmacc_vf_f64m4
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f64m4
|
||||
#define VFMULVF_FLOAT vfmul_vf_f64m4
|
||||
# define LMUL m4
|
||||
# if defined(DOUBLE)
|
||||
# define ELEN 64
|
||||
# else
|
||||
# define ELEN 32
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#define _
|
||||
#define JOIN2_X(x, y) x ## y
|
||||
#define JOIN2(x, y) JOIN2_X(x, y)
|
||||
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
|
||||
|
||||
#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _)
|
||||
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
|
||||
#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL)
|
||||
#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL)
|
||||
#define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL)
|
||||
#define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL)
|
||||
#define VFMACCVF_FLOAT JOIN(RISCV_RVV(vfmacc), _vf_f, ELEN, LMUL, _)
|
||||
#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _)
|
||||
#define VFMULVF_FLOAT JOIN(RISCV_RVV(vfmul), _vf_f, ELEN, LMUL, _)
|
||||
|
||||
int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y)
|
||||
{
|
||||
if (n < 0) return(0);
|
||||
if (n <= 0) return(0);
|
||||
|
||||
BLASLONG i=0, j=0;
|
||||
unsigned int gvl = 0;
|
||||
|
||||
@@ -42,7 +42,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
|
||||
BLASLONG i=0;
|
||||
BLASLONG ix,iy;
|
||||
|
||||
if ( n < 0 ) return(0);
|
||||
if ( n <= 0 ) return(0);
|
||||
if ( da == 0.0 ) return(0);
|
||||
|
||||
ix = 0;
|
||||
|
||||
109
kernel/riscv64/axpy_rvv.c
Normal file
109
kernel/riscv64/axpy_rvv.c
Normal file
@@ -0,0 +1,109 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Precision-generic aliases for the RVV intrinsics used by this kernel.
 * The single-precision (e32/f32) intrinsics are selected unless the build
 * defines DOUBLE, in which case the e64/f64 ones are used.  All aliases use
 * the m8 (LMUL = 8) variants.
 *
 *   VSETVL(n)       vector length to use for n remaining elements
 *   FLOAT_V_T       vector register-group type
 *   VLEV_FLOAT      unit-stride load
 *   VLSEV_FLOAT     strided load
 *   VSEV_FLOAT      unit-stride store
 *   VSSEV_FLOAT     strided store
 *   VFMACCVF_FLOAT  multiply-accumulate with a scalar multiplier
 */
#if !defined(DOUBLE)
#define VSETVL(n)               __riscv_vsetvl_e32m8(n)
#define FLOAT_V_T               vfloat32m8_t
#define VLEV_FLOAT              __riscv_vle32_v_f32m8
#define VLSEV_FLOAT             __riscv_vlse32_v_f32m8
#define VSEV_FLOAT              __riscv_vse32_v_f32m8
#define VSSEV_FLOAT             __riscv_vsse32_v_f32m8
#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m8
#else
#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
#define FLOAT_V_T               vfloat64m8_t
#define VLEV_FLOAT              __riscv_vle64_v_f64m8
#define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
#define VSEV_FLOAT              __riscv_vse64_v_f64m8
#define VSSEV_FLOAT             __riscv_vsse64_v_f64m8
#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m8
#endif
|
||||
|
||||
/*
 * Vector update y := y + da * x, over n elements.
 *
 * n       number of elements; nothing is done when n <= 0
 * dummy0, dummy1, dummy, dummy2
 *         not used by this kernel
 * da      scale applied to x; nothing is done when da == 0
 * x       source vector
 * inc_x   distance between consecutive elements of x, in elements
 * y       destination vector, updated in place
 * inc_y   distance between consecutive elements of y, in elements
 *
 * Always returns 0.
 *
 * The four branches differ only in whether x and/or y are unit-stride, so
 * that plain loads/stores are used where possible.
 *
 * The vector length vl is unsigned; it is converted to BLASLONG before being
 * multiplied by an increment so that a negative increment produces a signed
 * element offset instead of a wrapped unsigned one.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
    if ( n <= 0 ) return(0);
    if ( da == 0.0 ) return(0);

    FLOAT_V_T vx, vy;

    if(inc_x == 1 && inc_y == 1) {

        for (size_t vl; n > 0; n -= (BLASLONG)vl, x += vl, y += vl) {
            vl = VSETVL(n);

            vx = VLEV_FLOAT(x, vl);
            vy = VLEV_FLOAT(y, vl);
            vy = VFMACCVF_FLOAT(vy, da, vx, vl);
            VSEV_FLOAT (y, vy, vl);
        }

    } else if (1 == inc_y) {

        BLASLONG stride_x = inc_x * sizeof(FLOAT);

        for (size_t vl; n > 0; n -= (BLASLONG)vl, x += (BLASLONG)vl * inc_x, y += vl) {
            vl = VSETVL(n);

            vx = VLSEV_FLOAT(x, stride_x, vl);
            vy = VLEV_FLOAT(y, vl);
            vy = VFMACCVF_FLOAT(vy, da, vx, vl);
            VSEV_FLOAT(y, vy, vl);
        }

    } else if (1 == inc_x) {

        BLASLONG stride_y = inc_y * sizeof(FLOAT);

        for (size_t vl; n > 0; n -= (BLASLONG)vl, x += vl, y += (BLASLONG)vl * inc_y) {
            vl = VSETVL(n);

            vx = VLEV_FLOAT(x, vl);
            vy = VLSEV_FLOAT(y, stride_y, vl);
            vy = VFMACCVF_FLOAT(vy, da, vx, vl);
            VSSEV_FLOAT(y, stride_y, vy, vl);
        }

    } else {

        BLASLONG stride_x = inc_x * sizeof(FLOAT);
        BLASLONG stride_y = inc_y * sizeof(FLOAT);

        for (size_t vl; n > 0; n -= (BLASLONG)vl, x += (BLASLONG)vl * inc_x, y += (BLASLONG)vl * inc_y) {
            vl = VSETVL(n);

            vx = VLSEV_FLOAT(x, stride_x, vl);
            vy = VLSEV_FLOAT(y, stride_y, vl);
            vy = VFMACCVF_FLOAT(vy, da, vx, vl);
            VSSEV_FLOAT(y, stride_y, vy, vl);
        }

    }

    return(0);
}
|
||||
@@ -25,26 +25,38 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m4(n)
|
||||
#define FLOAT_V_T vfloat32m4_t
|
||||
#define VLEV_FLOAT vle32_v_f32m4
|
||||
#define VLSEV_FLOAT vlse32_v_f32m4
|
||||
#define VSEV_FLOAT vse32_v_f32m4
|
||||
#define VSSEV_FLOAT vsse32_v_f32m4
|
||||
#define VFMACCVF_FLOAT vfmacc_vf_f32m4
|
||||
#ifdef RISCV64_ZVL256B
|
||||
# define LMUL m2
|
||||
# if defined(DOUBLE)
|
||||
# define ELEN 64
|
||||
# else
|
||||
# define ELEN 32
|
||||
# endif
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m4(n)
|
||||
#define FLOAT_V_T vfloat64m4_t
|
||||
#define VLEV_FLOAT vle64_v_f64m4
|
||||
#define VLSEV_FLOAT vlse64_v_f64m4
|
||||
#define VSEV_FLOAT vse64_v_f64m4
|
||||
#define VSSEV_FLOAT vsse64_v_f64m4
|
||||
#define VFMACCVF_FLOAT vfmacc_vf_f64m4
|
||||
# define LMUL m4
|
||||
# if defined(DOUBLE)
|
||||
# define ELEN 64
|
||||
# else
|
||||
# define ELEN 32
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#define _
|
||||
#define JOIN2_X(x, y) x ## y
|
||||
#define JOIN2(x, y) JOIN2_X(x, y)
|
||||
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
|
||||
|
||||
#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _)
|
||||
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
|
||||
#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL)
|
||||
#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL)
|
||||
#define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL)
|
||||
#define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL)
|
||||
#define VFMACCVF_FLOAT JOIN(RISCV_RVV(vfmacc), _vf_f, ELEN, LMUL, _)
|
||||
|
||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
|
||||
{
|
||||
BLASLONG i=0, j=0, jx=0, jy=0;
|
||||
|
||||
996
kernel/riscv64/cgemm_kernel_8x4_zvl128b.c
Normal file
996
kernel/riscv64/cgemm_kernel_8x4_zvl128b.c
Normal file
@@ -0,0 +1,996 @@
|
||||
/*
|
||||
|
||||
AUTOGENERATED KERNEL
|
||||
Script: ./kernel/riscv64/generate_kernel.py
|
||||
Settings:
|
||||
LMUL=2
|
||||
M=8
|
||||
M_tail_scalar_from=2
|
||||
N=4
|
||||
__riscv_='__riscv_'
|
||||
complex=True
|
||||
conjugate=False
|
||||
cpu='zvl128b'
|
||||
force_acc_double=False
|
||||
index_type='BLASLONG'
|
||||
op='gemm'
|
||||
param_precision='float'
|
||||
reg_width_bits=128
|
||||
tail_policy=''
|
||||
trace=False
|
||||
|
||||
Derived:
|
||||
ELEN_ACC=32
|
||||
ELEN_PARAM=32
|
||||
LMUL_ACC=2
|
||||
VFMACC='__riscv_vfmacc_vf_f32m2'
|
||||
VFMUL='__riscv_vfmul_vf_f32m2'
|
||||
VLEV='__riscv_vle32_v_f32m2'
|
||||
VLSEV='__riscv_vlse32_v_f32m2'
|
||||
VMACC_TO_ACC='__riscv_vfmacc_vf_f32m2'
|
||||
VMUL_TO_ACC='__riscv_vfmul_vf_f32m2'
|
||||
VSETVL='__riscv_vsetvl_e32m2'
|
||||
VSEV='__riscv_vse32_v_f32m2'
|
||||
VSSEV='__riscv_vsse32_v_f32m2'
|
||||
acc_vector_t='vfloat32m2_t'
|
||||
output='cgemm_kernel_8x4_zvl128b.c'
|
||||
param_scalar_t='float'
|
||||
param_vector_t='vfloat32m2_t'
|
||||
|
||||
*/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Sign and fused-multiply selectors for the complex product, chosen by the
 * transposition/conjugation variant macro the build defines.
 *
 * Scalar tail code accumulates, per A element (Ar, Ai) and B element (Br, Bi):
 *     real += S0 * Ar * Br + S1 * Ai * Bi
 *     imag += S2 * Ai * Br + S3 * Ar * Bi
 *
 * Vector code first forms Ai*Bi (real) and Ar*Bi (imag) with a plain multiply
 * and then folds in Ar*Br via VFMACC_RR and Ai*Br via VFMACC_RI; the chosen
 * intrinsic supplies the same signs as S0..S3:
 *
 *   NN/NT/TN/TT   real = ArBr - AiBi   imag =  AiBr + ArBi
 *   NR/NC/TR/TC   real = ArBr + AiBi   imag =  AiBr - ArBi
 *   RN/RT/CN/CT   real = ArBr + AiBi   imag = -AiBr + ArBi
 *   RR/RC/CR/CC   real = ArBr - AiBi   imag = -AiBr - ArBi
 *
 * Negative constants are parenthesized so they stay a single operand wherever
 * they are substituted.
 */
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define S0 1
#define S1 (-1)
#define S2 1
#define S3 1
#define VFMACC_RR __riscv_vfmsac
#define VFMACC_RI __riscv_vfmacc
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define S0 1
#define S1 1
#define S2 1
#define S3 (-1)
#define VFMACC_RR __riscv_vfmacc
#define VFMACC_RI __riscv_vfmsac
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define S0 1
#define S1 1
#define S2 (-1)
#define S3 1
#define VFMACC_RR __riscv_vfmacc
#define VFMACC_RI __riscv_vfnmsac
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
#define S0 1
#define S1 (-1)
#define S2 (-1)
#define S3 (-1)
#define VFMACC_RR __riscv_vfmsac
#define VFMACC_RI __riscv_vfnmacc
#endif
|
||||
|
||||
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc)
|
||||
|
||||
{
|
||||
BLASLONG gvl = 0;
|
||||
BLASLONG m_top = 0;
|
||||
BLASLONG n_top = 0;
|
||||
|
||||
// -- MAIN PASS
|
||||
|
||||
for (BLASLONG j = 0; j < N / 4; j += 1) {
|
||||
m_top = 0;
|
||||
BLASLONG gvl = __riscv_vsetvl_e32m2(8);
|
||||
|
||||
for (BLASLONG i = 0; i < M / 8; i += 1) {
|
||||
BLASLONG ai = m_top * K * 2;
|
||||
BLASLONG bi = n_top * K * 2;
|
||||
float B0r = B[bi + 0 * 2 + 0];
|
||||
float B0i = B[bi + 0 * 2 + 1];
|
||||
float B1r = B[bi + 1 * 2 + 0];
|
||||
float B1i = B[bi + 1 * 2 + 1];
|
||||
float B2r = B[bi + 2 * 2 + 0];
|
||||
float B2i = B[bi + 2 * 2 + 1];
|
||||
float B3r = B[bi + 3 * 2 + 0];
|
||||
float B3i = B[bi + 3 * 2 + 1];
|
||||
bi += 4 * 2;
|
||||
|
||||
vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
|
||||
vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
|
||||
ai += 8 * 2;
|
||||
|
||||
// 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
|
||||
// leaving 6 vector registers for temporaries
|
||||
// performing 2 operations between reuses of temporaries
|
||||
vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
|
||||
vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
|
||||
vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
|
||||
vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
|
||||
tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
|
||||
tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
|
||||
tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
|
||||
tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
|
||||
vfloat32m2_t ACC0r = tmp0r;
|
||||
vfloat32m2_t ACC0i = tmp0i;
|
||||
vfloat32m2_t ACC1r = tmp1r;
|
||||
vfloat32m2_t ACC1i = tmp1i;
|
||||
tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl);
|
||||
tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl);
|
||||
tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl);
|
||||
tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl);
|
||||
tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl);
|
||||
tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl);
|
||||
tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl);
|
||||
tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl);
|
||||
vfloat32m2_t ACC2r = tmp0r;
|
||||
vfloat32m2_t ACC2i = tmp0i;
|
||||
vfloat32m2_t ACC3r = tmp1r;
|
||||
vfloat32m2_t ACC3i = tmp1i;
|
||||
|
||||
for (BLASLONG k = 1; k < K; k++) {
|
||||
B0r = B[bi + 0 * 2 + 0];
|
||||
B0i = B[bi + 0 * 2 + 1];
|
||||
B1r = B[bi + 1 * 2 + 0];
|
||||
B1i = B[bi + 1 * 2 + 1];
|
||||
B2r = B[bi + 2 * 2 + 0];
|
||||
B2i = B[bi + 2 * 2 + 1];
|
||||
B3r = B[bi + 3 * 2 + 0];
|
||||
B3i = B[bi + 3 * 2 + 1];
|
||||
bi += 4 * 2;
|
||||
|
||||
A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
|
||||
A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
|
||||
ai += 8 * 2;
|
||||
|
||||
tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
|
||||
tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
|
||||
tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
|
||||
tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
|
||||
tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
|
||||
tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
|
||||
tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
|
||||
tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
|
||||
ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
|
||||
ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
|
||||
ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl);
|
||||
ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl);
|
||||
tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl);
|
||||
tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl);
|
||||
tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl);
|
||||
tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl);
|
||||
tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl);
|
||||
tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl);
|
||||
tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl);
|
||||
tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl);
|
||||
ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl);
|
||||
ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl);
|
||||
ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl);
|
||||
ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl);
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
|
||||
vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
|
||||
vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
|
||||
vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
vfloat32m2_t C2r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
|
||||
vfloat32m2_t C2i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
vfloat32m2_t C3r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
|
||||
vfloat32m2_t C3i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
|
||||
|
||||
C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl);
|
||||
C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl);
|
||||
C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl);
|
||||
C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl);
|
||||
C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl);
|
||||
C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl);
|
||||
C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl);
|
||||
C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl);
|
||||
C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
|
||||
C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
|
||||
C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl);
|
||||
C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl);
|
||||
C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl);
|
||||
C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl);
|
||||
C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl);
|
||||
C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl);
|
||||
|
||||
ci = n_top * ldc + m_top;
|
||||
|
||||
__riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
|
||||
__riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl);
|
||||
__riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl);
|
||||
__riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl);
|
||||
__riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl);
|
||||
|
||||
m_top += 8;
|
||||
}
|
||||
|
||||
// -- tails for main pass
|
||||
|
||||
if (M & 4) {
|
||||
gvl = __riscv_vsetvl_e32m2(4);
|
||||
|
||||
BLASLONG ai = m_top * K * 2;
|
||||
BLASLONG bi = n_top * K * 2;
|
||||
float B0r = B[bi + 0 * 2 + 0];
|
||||
float B0i = B[bi + 0 * 2 + 1];
|
||||
float B1r = B[bi + 1 * 2 + 0];
|
||||
float B1i = B[bi + 1 * 2 + 1];
|
||||
float B2r = B[bi + 2 * 2 + 0];
|
||||
float B2i = B[bi + 2 * 2 + 1];
|
||||
float B3r = B[bi + 3 * 2 + 0];
|
||||
float B3i = B[bi + 3 * 2 + 1];
|
||||
bi += 4 * 2;
|
||||
|
||||
vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
|
||||
vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
|
||||
ai += 4 * 2;
|
||||
|
||||
// 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
|
||||
// leaving 6 vector registers for temporaries
|
||||
// performing 2 operations between reuses of temporaries
|
||||
vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
|
||||
vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
|
||||
vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
|
||||
vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
|
||||
tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
|
||||
tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
|
||||
tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
|
||||
tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
|
||||
vfloat32m2_t ACC0r = tmp0r;
|
||||
vfloat32m2_t ACC0i = tmp0i;
|
||||
vfloat32m2_t ACC1r = tmp1r;
|
||||
vfloat32m2_t ACC1i = tmp1i;
|
||||
tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl);
|
||||
tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl);
|
||||
tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl);
|
||||
tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl);
|
||||
tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl);
|
||||
tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl);
|
||||
tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl);
|
||||
tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl);
|
||||
vfloat32m2_t ACC2r = tmp0r;
|
||||
vfloat32m2_t ACC2i = tmp0i;
|
||||
vfloat32m2_t ACC3r = tmp1r;
|
||||
vfloat32m2_t ACC3i = tmp1i;
|
||||
|
||||
for (BLASLONG k = 1; k < K; k++) {
|
||||
B0r = B[bi + 0 * 2 + 0];
|
||||
B0i = B[bi + 0 * 2 + 1];
|
||||
B1r = B[bi + 1 * 2 + 0];
|
||||
B1i = B[bi + 1 * 2 + 1];
|
||||
B2r = B[bi + 2 * 2 + 0];
|
||||
B2i = B[bi + 2 * 2 + 1];
|
||||
B3r = B[bi + 3 * 2 + 0];
|
||||
B3i = B[bi + 3 * 2 + 1];
|
||||
bi += 4 * 2;
|
||||
|
||||
A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
|
||||
A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
|
||||
ai += 4 * 2;
|
||||
|
||||
tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
|
||||
tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
|
||||
tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
|
||||
tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
|
||||
tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
|
||||
tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
|
||||
tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
|
||||
tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
|
||||
ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
|
||||
ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
|
||||
ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl);
|
||||
ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl);
|
||||
tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl);
|
||||
tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl);
|
||||
tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl);
|
||||
tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl);
|
||||
tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl);
|
||||
tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl);
|
||||
tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl);
|
||||
tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl);
|
||||
ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl);
|
||||
ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl);
|
||||
ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl);
|
||||
ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl);
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
|
||||
vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
|
||||
vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
|
||||
vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
vfloat32m2_t C2r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
|
||||
vfloat32m2_t C2i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
vfloat32m2_t C3r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
|
||||
vfloat32m2_t C3i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
|
||||
|
||||
C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl);
|
||||
C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl);
|
||||
C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl);
|
||||
C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl);
|
||||
C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl);
|
||||
C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl);
|
||||
C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl);
|
||||
C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl);
|
||||
C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
|
||||
C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
|
||||
C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl);
|
||||
C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl);
|
||||
C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl);
|
||||
C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl);
|
||||
C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl);
|
||||
C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl);
|
||||
|
||||
ci = n_top * ldc + m_top;
|
||||
|
||||
__riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
|
||||
__riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl);
|
||||
__riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl);
|
||||
__riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl);
|
||||
__riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl);
|
||||
|
||||
m_top += 4;
|
||||
}
|
||||
|
||||
if (M & 2) {
|
||||
float result0 = 0;
|
||||
float result1 = 0;
|
||||
float result2 = 0;
|
||||
float result3 = 0;
|
||||
float result4 = 0;
|
||||
float result5 = 0;
|
||||
float result6 = 0;
|
||||
float result7 = 0;
|
||||
float result8 = 0;
|
||||
float result9 = 0;
|
||||
float result10 = 0;
|
||||
float result11 = 0;
|
||||
float result12 = 0;
|
||||
float result13 = 0;
|
||||
float result14 = 0;
|
||||
float result15 = 0;
|
||||
BLASLONG ai = m_top * K * 2;
|
||||
BLASLONG bi = n_top * K * 2;
|
||||
|
||||
for (BLASLONG k = 0; k < K; k++) {
|
||||
result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
|
||||
result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
|
||||
result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1];
|
||||
result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1];
|
||||
result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1];
|
||||
result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1];
|
||||
result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1];
|
||||
result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1];
|
||||
result8 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1];
|
||||
result9 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1];
|
||||
result10 += S0 * A[ai + 2 + 0] * B[bi + 4 + 0] + S1 * A[ai + 2 + 1] * B[bi + 4 + 1];
|
||||
result11 += S2 * A[ai + 2 + 1] * B[bi + 4 + 0] + S3 * A[ai + 2 + 0] * B[bi + 4 + 1];
|
||||
result12 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1];
|
||||
result13 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1];
|
||||
result14 += S0 * A[ai + 2 + 0] * B[bi + 6 + 0] + S1 * A[ai + 2 + 1] * B[bi + 6 + 1];
|
||||
result15 += S2 * A[ai + 2 + 1] * B[bi + 6 + 0] + S3 * A[ai + 2 + 0] * B[bi + 6 + 1];
|
||||
ai += 2 * 2;
|
||||
bi += 4 * 2;
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
float Cr, Ci;
|
||||
Cr = C[(ci + 0 * ldc + 0) * 2 + 0];
|
||||
Ci = C[(ci + 0 * ldc + 0) * 2 + 1];
|
||||
Cr += result0 * alphar;
|
||||
Ci += result1 * alphar;
|
||||
Cr -= result1 * alphai;
|
||||
Ci += result0 * alphai;
|
||||
C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
|
||||
C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
|
||||
Cr = C[(ci + 0 * ldc + 1) * 2 + 0];
|
||||
Ci = C[(ci + 0 * ldc + 1) * 2 + 1];
|
||||
Cr += result2 * alphar;
|
||||
Ci += result3 * alphar;
|
||||
Cr -= result3 * alphai;
|
||||
Ci += result2 * alphai;
|
||||
C[(ci + 0 * ldc + 1) * 2 + 0] = Cr;
|
||||
C[(ci + 0 * ldc + 1) * 2 + 1] = Ci;
|
||||
Cr = C[(ci + 1 * ldc + 0) * 2 + 0];
|
||||
Ci = C[(ci + 1 * ldc + 0) * 2 + 1];
|
||||
Cr += result4 * alphar;
|
||||
Ci += result5 * alphar;
|
||||
Cr -= result5 * alphai;
|
||||
Ci += result4 * alphai;
|
||||
C[(ci + 1 * ldc + 0) * 2 + 0] = Cr;
|
||||
C[(ci + 1 * ldc + 0) * 2 + 1] = Ci;
|
||||
Cr = C[(ci + 1 * ldc + 1) * 2 + 0];
|
||||
Ci = C[(ci + 1 * ldc + 1) * 2 + 1];
|
||||
Cr += result6 * alphar;
|
||||
Ci += result7 * alphar;
|
||||
Cr -= result7 * alphai;
|
||||
Ci += result6 * alphai;
|
||||
C[(ci + 1 * ldc + 1) * 2 + 0] = Cr;
|
||||
C[(ci + 1 * ldc + 1) * 2 + 1] = Ci;
|
||||
Cr = C[(ci + 2 * ldc + 0) * 2 + 0];
|
||||
Ci = C[(ci + 2 * ldc + 0) * 2 + 1];
|
||||
Cr += result8 * alphar;
|
||||
Ci += result9 * alphar;
|
||||
Cr -= result9 * alphai;
|
||||
Ci += result8 * alphai;
|
||||
C[(ci + 2 * ldc + 0) * 2 + 0] = Cr;
|
||||
C[(ci + 2 * ldc + 0) * 2 + 1] = Ci;
|
||||
Cr = C[(ci + 2 * ldc + 1) * 2 + 0];
|
||||
Ci = C[(ci + 2 * ldc + 1) * 2 + 1];
|
||||
Cr += result10 * alphar;
|
||||
Ci += result11 * alphar;
|
||||
Cr -= result11 * alphai;
|
||||
Ci += result10 * alphai;
|
||||
C[(ci + 2 * ldc + 1) * 2 + 0] = Cr;
|
||||
C[(ci + 2 * ldc + 1) * 2 + 1] = Ci;
|
||||
Cr = C[(ci + 3 * ldc + 0) * 2 + 0];
|
||||
Ci = C[(ci + 3 * ldc + 0) * 2 + 1];
|
||||
Cr += result12 * alphar;
|
||||
Ci += result13 * alphar;
|
||||
Cr -= result13 * alphai;
|
||||
Ci += result12 * alphai;
|
||||
C[(ci + 3 * ldc + 0) * 2 + 0] = Cr;
|
||||
C[(ci + 3 * ldc + 0) * 2 + 1] = Ci;
|
||||
Cr = C[(ci + 3 * ldc + 1) * 2 + 0];
|
||||
Ci = C[(ci + 3 * ldc + 1) * 2 + 1];
|
||||
Cr += result14 * alphar;
|
||||
Ci += result15 * alphar;
|
||||
Cr -= result15 * alphai;
|
||||
Ci += result14 * alphai;
|
||||
C[(ci + 3 * ldc + 1) * 2 + 0] = Cr;
|
||||
C[(ci + 3 * ldc + 1) * 2 + 1] = Ci;
|
||||
m_top += 2;
|
||||
}
|
||||
|
||||
if (M & 1) {
|
||||
float result0 = 0;
|
||||
float result1 = 0;
|
||||
float result2 = 0;
|
||||
float result3 = 0;
|
||||
float result4 = 0;
|
||||
float result5 = 0;
|
||||
float result6 = 0;
|
||||
float result7 = 0;
|
||||
BLASLONG ai = m_top * K * 2;
|
||||
BLASLONG bi = n_top * K * 2;
|
||||
|
||||
for (BLASLONG k = 0; k < K; k++) {
|
||||
result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
|
||||
result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
|
||||
result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1];
|
||||
result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1];
|
||||
result4 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1];
|
||||
result5 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1];
|
||||
result6 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1];
|
||||
result7 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1];
|
||||
ai += 1 * 2;
|
||||
bi += 4 * 2;
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
float Cr, Ci;
|
||||
Cr = C[(ci + 0 * ldc + 0) * 2 + 0];
|
||||
Ci = C[(ci + 0 * ldc + 0) * 2 + 1];
|
||||
Cr += result0 * alphar;
|
||||
Ci += result1 * alphar;
|
||||
Cr -= result1 * alphai;
|
||||
Ci += result0 * alphai;
|
||||
C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
|
||||
C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
|
||||
Cr = C[(ci + 1 * ldc + 0) * 2 + 0];
|
||||
Ci = C[(ci + 1 * ldc + 0) * 2 + 1];
|
||||
Cr += result2 * alphar;
|
||||
Ci += result3 * alphar;
|
||||
Cr -= result3 * alphai;
|
||||
Ci += result2 * alphai;
|
||||
C[(ci + 1 * ldc + 0) * 2 + 0] = Cr;
|
||||
C[(ci + 1 * ldc + 0) * 2 + 1] = Ci;
|
||||
Cr = C[(ci + 2 * ldc + 0) * 2 + 0];
|
||||
Ci = C[(ci + 2 * ldc + 0) * 2 + 1];
|
||||
Cr += result4 * alphar;
|
||||
Ci += result5 * alphar;
|
||||
Cr -= result5 * alphai;
|
||||
Ci += result4 * alphai;
|
||||
C[(ci + 2 * ldc + 0) * 2 + 0] = Cr;
|
||||
C[(ci + 2 * ldc + 0) * 2 + 1] = Ci;
|
||||
Cr = C[(ci + 3 * ldc + 0) * 2 + 0];
|
||||
Ci = C[(ci + 3 * ldc + 0) * 2 + 1];
|
||||
Cr += result6 * alphar;
|
||||
Ci += result7 * alphar;
|
||||
Cr -= result7 * alphai;
|
||||
Ci += result6 * alphai;
|
||||
C[(ci + 3 * ldc + 0) * 2 + 0] = Cr;
|
||||
C[(ci + 3 * ldc + 0) * 2 + 1] = Ci;
|
||||
m_top += 1;
|
||||
}
|
||||
|
||||
n_top += 4;
|
||||
}
|
||||
|
||||
// -- tails for N=2
|
||||
|
||||
if (N & 2) {
|
||||
gvl = __riscv_vsetvl_e32m2(8);
|
||||
m_top = 0;
|
||||
|
||||
for (BLASLONG i = 0; i < M / 8; i += 1) {
|
||||
BLASLONG ai = m_top * K * 2;
|
||||
BLASLONG bi = n_top * K * 2;
|
||||
float B0r = B[bi + 0 * 2 + 0];
|
||||
float B0i = B[bi + 0 * 2 + 1];
|
||||
float B1r = B[bi + 1 * 2 + 0];
|
||||
float B1i = B[bi + 1 * 2 + 1];
|
||||
bi += 2 * 2;
|
||||
|
||||
vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
|
||||
vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
|
||||
ai += 8 * 2;
|
||||
|
||||
// 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
|
||||
// leaving 10 vector registers for temporaries
|
||||
vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
|
||||
vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
|
||||
vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
|
||||
vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
|
||||
tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
|
||||
tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
|
||||
tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
|
||||
tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
|
||||
vfloat32m2_t ACC0r = tmp0r;
|
||||
vfloat32m2_t ACC0i = tmp0i;
|
||||
vfloat32m2_t ACC1r = tmp1r;
|
||||
vfloat32m2_t ACC1i = tmp1i;
|
||||
|
||||
for (BLASLONG k = 1; k < K; k++) {
|
||||
B0r = B[bi + 0 * 2 + 0];
|
||||
B0i = B[bi + 0 * 2 + 1];
|
||||
B1r = B[bi + 1 * 2 + 0];
|
||||
B1i = B[bi + 1 * 2 + 1];
|
||||
bi += 2 * 2;
|
||||
|
||||
A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
|
||||
A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
|
||||
ai += 8 * 2;
|
||||
|
||||
tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
|
||||
tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
|
||||
tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
|
||||
tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
|
||||
tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
|
||||
tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
|
||||
tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
|
||||
tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
|
||||
ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
|
||||
ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
|
||||
ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl);
|
||||
ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl);
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
|
||||
vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
|
||||
vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
|
||||
vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
|
||||
|
||||
C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl);
|
||||
C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl);
|
||||
C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl);
|
||||
C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl);
|
||||
C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
|
||||
C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
|
||||
C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl);
|
||||
C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl);
|
||||
|
||||
ci = n_top * ldc + m_top;
|
||||
|
||||
__riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
|
||||
__riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl);
|
||||
__riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl);
|
||||
|
||||
m_top += 8;
|
||||
}
|
||||
|
||||
if (M & 4) {
|
||||
gvl = __riscv_vsetvl_e32m2(4);
|
||||
|
||||
BLASLONG ai = m_top * K * 2;
|
||||
BLASLONG bi = n_top * K * 2;
|
||||
float B0r = B[bi + 0 * 2 + 0];
|
||||
float B0i = B[bi + 0 * 2 + 1];
|
||||
float B1r = B[bi + 1 * 2 + 0];
|
||||
float B1i = B[bi + 1 * 2 + 1];
|
||||
bi += 2 * 2;
|
||||
|
||||
vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
|
||||
vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
|
||||
ai += 4 * 2;
|
||||
|
||||
// 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
|
||||
// leaving 10 vector registers for temporaries
|
||||
vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
|
||||
vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
|
||||
vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
|
||||
vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
|
||||
tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
|
||||
tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
|
||||
tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
|
||||
tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
|
||||
vfloat32m2_t ACC0r = tmp0r;
|
||||
vfloat32m2_t ACC0i = tmp0i;
|
||||
vfloat32m2_t ACC1r = tmp1r;
|
||||
vfloat32m2_t ACC1i = tmp1i;
|
||||
|
||||
for (BLASLONG k = 1; k < K; k++) {
|
||||
B0r = B[bi + 0 * 2 + 0];
|
||||
B0i = B[bi + 0 * 2 + 1];
|
||||
B1r = B[bi + 1 * 2 + 0];
|
||||
B1i = B[bi + 1 * 2 + 1];
|
||||
bi += 2 * 2;
|
||||
|
||||
A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
|
||||
A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
|
||||
ai += 4 * 2;
|
||||
|
||||
tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
|
||||
tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
|
||||
tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
|
||||
tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
|
||||
tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
|
||||
tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
|
||||
tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
|
||||
tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
|
||||
ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
|
||||
ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
|
||||
ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl);
|
||||
ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl);
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
|
||||
vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
|
||||
vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
|
||||
vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
|
||||
|
||||
C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl);
|
||||
C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl);
|
||||
C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl);
|
||||
C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl);
|
||||
C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
|
||||
C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
|
||||
C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl);
|
||||
C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl);
|
||||
|
||||
ci = n_top * ldc + m_top;
|
||||
|
||||
__riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
|
||||
__riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl);
|
||||
__riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl);
|
||||
|
||||
m_top += 4;
|
||||
}
|
||||
|
||||
if (M & 2) {
|
||||
float result0 = 0;
|
||||
float result1 = 0;
|
||||
float result2 = 0;
|
||||
float result3 = 0;
|
||||
float result4 = 0;
|
||||
float result5 = 0;
|
||||
float result6 = 0;
|
||||
float result7 = 0;
|
||||
BLASLONG ai = m_top * K * 2;
|
||||
BLASLONG bi = n_top * K * 2;
|
||||
|
||||
for (BLASLONG k = 0; k < K; k++) {
|
||||
result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
|
||||
result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
|
||||
result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1];
|
||||
result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1];
|
||||
result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1];
|
||||
result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1];
|
||||
result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1];
|
||||
result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1];
|
||||
ai += 2 * 2;
|
||||
bi += 2 * 2;
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
float Cr, Ci;
|
||||
Cr = C[(ci + 0 * ldc + 0) * 2 + 0];
|
||||
Ci = C[(ci + 0 * ldc + 0) * 2 + 1];
|
||||
Cr += result0 * alphar;
|
||||
Ci += result1 * alphar;
|
||||
Cr -= result1 * alphai;
|
||||
Ci += result0 * alphai;
|
||||
C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
|
||||
C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
|
||||
Cr = C[(ci + 0 * ldc + 1) * 2 + 0];
|
||||
Ci = C[(ci + 0 * ldc + 1) * 2 + 1];
|
||||
Cr += result2 * alphar;
|
||||
Ci += result3 * alphar;
|
||||
Cr -= result3 * alphai;
|
||||
Ci += result2 * alphai;
|
||||
C[(ci + 0 * ldc + 1) * 2 + 0] = Cr;
|
||||
C[(ci + 0 * ldc + 1) * 2 + 1] = Ci;
|
||||
Cr = C[(ci + 1 * ldc + 0) * 2 + 0];
|
||||
Ci = C[(ci + 1 * ldc + 0) * 2 + 1];
|
||||
Cr += result4 * alphar;
|
||||
Ci += result5 * alphar;
|
||||
Cr -= result5 * alphai;
|
||||
Ci += result4 * alphai;
|
||||
C[(ci + 1 * ldc + 0) * 2 + 0] = Cr;
|
||||
C[(ci + 1 * ldc + 0) * 2 + 1] = Ci;
|
||||
Cr = C[(ci + 1 * ldc + 1) * 2 + 0];
|
||||
Ci = C[(ci + 1 * ldc + 1) * 2 + 1];
|
||||
Cr += result6 * alphar;
|
||||
Ci += result7 * alphar;
|
||||
Cr -= result7 * alphai;
|
||||
Ci += result6 * alphai;
|
||||
C[(ci + 1 * ldc + 1) * 2 + 0] = Cr;
|
||||
C[(ci + 1 * ldc + 1) * 2 + 1] = Ci;
|
||||
m_top += 2;
|
||||
}
|
||||
|
||||
if (M & 1) {
|
||||
float result0 = 0;
|
||||
float result1 = 0;
|
||||
float result2 = 0;
|
||||
float result3 = 0;
|
||||
BLASLONG ai = m_top * K * 2;
|
||||
BLASLONG bi = n_top * K * 2;
|
||||
|
||||
for (BLASLONG k = 0; k < K; k++) {
|
||||
result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
|
||||
result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
|
||||
result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1];
|
||||
result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1];
|
||||
ai += 1 * 2;
|
||||
bi += 2 * 2;
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
float Cr, Ci;
|
||||
Cr = C[(ci + 0 * ldc + 0) * 2 + 0];
|
||||
Ci = C[(ci + 0 * ldc + 0) * 2 + 1];
|
||||
Cr += result0 * alphar;
|
||||
Ci += result1 * alphar;
|
||||
Cr -= result1 * alphai;
|
||||
Ci += result0 * alphai;
|
||||
C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
|
||||
C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
|
||||
Cr = C[(ci + 1 * ldc + 0) * 2 + 0];
|
||||
Ci = C[(ci + 1 * ldc + 0) * 2 + 1];
|
||||
Cr += result2 * alphar;
|
||||
Ci += result3 * alphar;
|
||||
Cr -= result3 * alphai;
|
||||
Ci += result2 * alphai;
|
||||
C[(ci + 1 * ldc + 0) * 2 + 0] = Cr;
|
||||
C[(ci + 1 * ldc + 0) * 2 + 1] = Ci;
|
||||
m_top += 1;
|
||||
}
|
||||
|
||||
n_top += 2;
|
||||
}
|
||||
|
||||
// -- tails for N=1
|
||||
|
||||
if (N & 1) {
|
||||
gvl = __riscv_vsetvl_e32m2(8);
|
||||
m_top = 0;
|
||||
|
||||
for (BLASLONG i = 0; i < M / 8; i += 1) {
|
||||
BLASLONG ai = m_top * K * 2;
|
||||
BLASLONG bi = n_top * K * 2;
|
||||
float B0r = B[bi + 0 * 2 + 0];
|
||||
float B0i = B[bi + 0 * 2 + 1];
|
||||
bi += 1 * 2;
|
||||
|
||||
vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
|
||||
vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
|
||||
ai += 8 * 2;
|
||||
|
||||
// 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
|
||||
// leaving 12 vector registers for temporaries
|
||||
vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
|
||||
vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
|
||||
tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
|
||||
tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
|
||||
vfloat32m2_t ACC0r = tmp0r;
|
||||
vfloat32m2_t ACC0i = tmp0i;
|
||||
|
||||
for (BLASLONG k = 1; k < K; k++) {
|
||||
B0r = B[bi + 0 * 2 + 0];
|
||||
B0i = B[bi + 0 * 2 + 1];
|
||||
bi += 1 * 2;
|
||||
|
||||
A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
|
||||
A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
|
||||
ai += 8 * 2;
|
||||
|
||||
tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
|
||||
tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
|
||||
tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
|
||||
tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
|
||||
ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
|
||||
ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
|
||||
vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
|
||||
vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
|
||||
|
||||
C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl);
|
||||
C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl);
|
||||
C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
|
||||
C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
|
||||
|
||||
ci = n_top * ldc + m_top;
|
||||
|
||||
__riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
|
||||
__riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
|
||||
|
||||
m_top += 8;
|
||||
}
|
||||
|
||||
if (M & 4) {
|
||||
gvl = __riscv_vsetvl_e32m2(4);
|
||||
|
||||
BLASLONG ai = m_top * K * 2;
|
||||
BLASLONG bi = n_top * K * 2;
|
||||
float B0r = B[bi + 0 * 2 + 0];
|
||||
float B0i = B[bi + 0 * 2 + 1];
|
||||
bi += 1 * 2;
|
||||
|
||||
vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
|
||||
vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
|
||||
ai += 4 * 2;
|
||||
|
||||
// 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
|
||||
// leaving 12 vector registers for temporaries
|
||||
vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
|
||||
vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
|
||||
tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
|
||||
tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
|
||||
vfloat32m2_t ACC0r = tmp0r;
|
||||
vfloat32m2_t ACC0i = tmp0i;
|
||||
|
||||
for (BLASLONG k = 1; k < K; k++) {
|
||||
B0r = B[bi + 0 * 2 + 0];
|
||||
B0i = B[bi + 0 * 2 + 1];
|
||||
bi += 1 * 2;
|
||||
|
||||
A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
|
||||
A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
|
||||
ai += 4 * 2;
|
||||
|
||||
tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
|
||||
tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
|
||||
tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
|
||||
tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
|
||||
ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
|
||||
ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
|
||||
vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
|
||||
vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
|
||||
|
||||
C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl);
|
||||
C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl);
|
||||
C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
|
||||
C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
|
||||
|
||||
ci = n_top * ldc + m_top;
|
||||
|
||||
__riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
|
||||
__riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
|
||||
|
||||
m_top += 4;
|
||||
}
|
||||
|
||||
if (M & 2) {
|
||||
float result0 = 0;
|
||||
float result1 = 0;
|
||||
float result2 = 0;
|
||||
float result3 = 0;
|
||||
BLASLONG ai = m_top * K * 2;
|
||||
BLASLONG bi = n_top * K * 2;
|
||||
|
||||
for (BLASLONG k = 0; k < K; k++) {
|
||||
result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
|
||||
result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
|
||||
result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1];
|
||||
result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1];
|
||||
ai += 2 * 2;
|
||||
bi += 1 * 2;
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
float Cr, Ci;
|
||||
Cr = C[(ci + 0 * ldc + 0) * 2 + 0];
|
||||
Ci = C[(ci + 0 * ldc + 0) * 2 + 1];
|
||||
Cr += result0 * alphar;
|
||||
Ci += result1 * alphar;
|
||||
Cr -= result1 * alphai;
|
||||
Ci += result0 * alphai;
|
||||
C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
|
||||
C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
|
||||
Cr = C[(ci + 0 * ldc + 1) * 2 + 0];
|
||||
Ci = C[(ci + 0 * ldc + 1) * 2 + 1];
|
||||
Cr += result2 * alphar;
|
||||
Ci += result3 * alphar;
|
||||
Cr -= result3 * alphai;
|
||||
Ci += result2 * alphai;
|
||||
C[(ci + 0 * ldc + 1) * 2 + 0] = Cr;
|
||||
C[(ci + 0 * ldc + 1) * 2 + 1] = Ci;
|
||||
m_top += 2;
|
||||
}
|
||||
|
||||
if (M & 1) {
|
||||
float result0 = 0;
|
||||
float result1 = 0;
|
||||
BLASLONG ai = m_top * K * 2;
|
||||
BLASLONG bi = n_top * K * 2;
|
||||
|
||||
for (BLASLONG k = 0; k < K; k++) {
|
||||
result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
|
||||
result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
|
||||
ai += 1 * 2;
|
||||
bi += 1 * 2;
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
float Cr, Ci;
|
||||
Cr = C[(ci + 0 * ldc + 0) * 2 + 0];
|
||||
Ci = C[(ci + 0 * ldc + 0) * 2 + 1];
|
||||
Cr += result0 * alphar;
|
||||
Ci += result1 * alphar;
|
||||
Cr -= result1 * alphai;
|
||||
Ci += result0 * alphai;
|
||||
C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
|
||||
C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
|
||||
m_top += 1;
|
||||
}
|
||||
|
||||
n_top += 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
1931
kernel/riscv64/cgemm_kernel_8x8_zvl256b.c
Normal file
1931
kernel/riscv64/cgemm_kernel_8x8_zvl256b.c
Normal file
File diff suppressed because it is too large
Load Diff
@@ -41,7 +41,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
BLASLONG i=0;
|
||||
BLASLONG ix=0,iy=0;
|
||||
|
||||
if ( n < 0 ) return(0);
|
||||
if ( n <= 0 ) return(0);
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
|
||||
94
kernel/riscv64/copy_rvv.c
Normal file
94
kernel/riscv64/copy_rvv.c
Normal file
@@ -0,0 +1,94 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Precision-dependent aliases for the RVV intrinsics and vector type used by
 * the copy kernel below. Every alias refers to an m8 (LMUL = 8) variant.
 */
#if !defined(DOUBLE)
|
||||
/* DOUBLE not defined: 32-bit element variants (e32 / f32). */
#define VSETVL(n) __riscv_vsetvl_e32m8(n)
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
/* vle / vlse: unit-stride and strided vector loads. */
#define VLEV_FLOAT __riscv_vle32_v_f32m8
|
||||
#define VLSEV_FLOAT __riscv_vlse32_v_f32m8
|
||||
/* vse / vsse: unit-stride and strided vector stores. */
#define VSEV_FLOAT __riscv_vse32_v_f32m8
|
||||
#define VSSEV_FLOAT __riscv_vsse32_v_f32m8
|
||||
#else
|
||||
/* DOUBLE defined: 64-bit element variants (e64 / f64). */
#define VSETVL(n) __riscv_vsetvl_e64m8(n)
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
/* vle / vlse: unit-stride and strided vector loads. */
#define VLEV_FLOAT __riscv_vle64_v_f64m8
|
||||
#define VLSEV_FLOAT __riscv_vlse64_v_f64m8
|
||||
/* vse / vsse: unit-stride and strided vector stores. */
#define VSEV_FLOAT __riscv_vse64_v_f64m8
|
||||
#define VSSEV_FLOAT __riscv_vsse64_v_f64m8
|
||||
#endif
|
||||
|
||||
/*
 * Copy n elements from x into y: y[i * inc_y] = x[i * inc_x], i = 0 .. n-1.
 *
 *   n      number of elements to copy; the call is a no-op when n <= 0
 *   x      source vector
 *   inc_x  element stride of x
 *   y      destination vector
 *   inc_y  element stride of y
 *
 * Always returns 0.
 *
 * Every pass obtains a vector length vl from VSETVL(n), moves vl elements,
 * and then advances n, x and y by that amount. A side with stride 1 uses the
 * unit-stride load/store; any other stride uses the strided form, which takes
 * the stride in bytes.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
    size_t vl;
    FLOAT_V_T chunk;

    if (n <= 0) {
        return 0;
    }

    if (inc_y == 1) {
        if (inc_x == 1) {
            /* Both sides contiguous: plain load, plain store. */
            while (n > 0) {
                vl = VSETVL(n);
                chunk = VLEV_FLOAT(x, vl);
                VSEV_FLOAT(y, chunk, vl);
                n -= vl;
                x += vl;
                y += vl;
            }
        } else {
            /* Strided source, contiguous destination. */
            BLASLONG src_bytes = inc_x * sizeof(FLOAT);

            while (n > 0) {
                vl = VSETVL(n);
                chunk = VLSEV_FLOAT(x, src_bytes, vl);
                VSEV_FLOAT(y, chunk, vl);
                n -= vl;
                x += vl*inc_x;
                y += vl;
            }
        }
    } else if (inc_x == 1) {
        /* Contiguous source, strided destination. */
        BLASLONG dst_bytes = inc_y * sizeof(FLOAT);

        while (n > 0) {
            vl = VSETVL(n);
            chunk = VLEV_FLOAT(x, vl);
            VSSEV_FLOAT(y, dst_bytes, chunk, vl);
            n -= vl;
            x += vl;
            y += vl*inc_y;
        }
    } else {
        /* Both sides strided. */
        BLASLONG src_bytes = inc_x * sizeof(FLOAT);
        BLASLONG dst_bytes = inc_y * sizeof(FLOAT);

        while (n > 0) {
            vl = VSETVL(n);
            chunk = VLSEV_FLOAT(x, src_bytes, vl);
            VSSEV_FLOAT(y, dst_bytes, chunk, vl);
            n -= vl;
            x += vl*inc_x;
            y += vl*inc_y;
        }
    }

    return 0;
}
|
||||
@@ -25,22 +25,35 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
#include "common.h"
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m8(n)
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define VLEV_FLOAT vle32_v_f32m8
|
||||
#define VLSEV_FLOAT vlse32_v_f32m8
|
||||
#define VSEV_FLOAT vse32_v_f32m8
|
||||
#define VSSEV_FLOAT vsse32_v_f32m8
|
||||
|
||||
#ifdef RISCV64_ZVL256B
|
||||
# define LMUL m2
|
||||
# if defined(DOUBLE)
|
||||
# define ELEN 64
|
||||
# else
|
||||
# define ELEN 32
|
||||
# endif
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m8(n)
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define VLEV_FLOAT vle64_v_f64m8
|
||||
#define VLSEV_FLOAT vlse64_v_f64m8
|
||||
#define VSEV_FLOAT vse64_v_f64m8
|
||||
#define VSSEV_FLOAT vsse64_v_f64m8
|
||||
# define LMUL m8
|
||||
# if defined(DOUBLE)
|
||||
# define ELEN 64
|
||||
# else
|
||||
# define ELEN 32
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#define _
|
||||
#define JOIN2_X(x, y) x ## y
|
||||
#define JOIN2(x, y) JOIN2_X(x, y)
|
||||
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
|
||||
|
||||
#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _)
|
||||
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
|
||||
#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL)
|
||||
#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL)
|
||||
#define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL)
|
||||
#define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL)
|
||||
|
||||
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
{
|
||||
BLASLONG i=0, j=0;
|
||||
@@ -58,7 +71,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
stride_x = inc_x * sizeof(FLOAT);
|
||||
if(gvl <= n/4){
|
||||
BLASLONG inc_xv = inc_x * gvl;
|
||||
BLASLONG gvl3 = gvl * 3;
|
||||
unsigned int gvl3 = gvl * 3;
|
||||
BLASLONG inc_xv3 = inc_xv * 3;
|
||||
for(i=0,j=0; i<n/(4*gvl); i++){
|
||||
v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
|
||||
@@ -86,7 +99,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
if(gvl <= n/4){
|
||||
BLASLONG inc_yv = inc_y * gvl;
|
||||
BLASLONG inc_yv3 = inc_yv * 3;
|
||||
BLASLONG gvl3 = gvl * 3;
|
||||
unsigned int gvl3 = gvl * 3;
|
||||
for(i=0,j=0; i<n/(4*gvl); i++){
|
||||
v0 = VLEV_FLOAT(&x[j], gvl);
|
||||
VSSEV_FLOAT(&y[iy], stride_y, v0, gvl);
|
||||
|
||||
1102
kernel/riscv64/ctrmm_kernel_8x4_zvl128b.c
Normal file
1102
kernel/riscv64/ctrmm_kernel_8x4_zvl128b.c
Normal file
File diff suppressed because it is too large
Load Diff
2007
kernel/riscv64/ctrmm_kernel_8x8_zvl256b.c
Normal file
2007
kernel/riscv64/ctrmm_kernel_8x8_zvl256b.c
Normal file
File diff suppressed because it is too large
Load Diff
@@ -196,7 +196,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
|
||||
|
||||
asm volatile(
|
||||
"vsetvli zero, zero, e64,m1 \n\t"
|
||||
"fmv.w.x ft11, zero \n\t"
|
||||
"fmv.d.x ft11, zero \n\t"
|
||||
"mv t0, %[BK] \n\t"
|
||||
|
||||
"vfmv.v.f v16, ft11 \n\t"
|
||||
|
||||
492
kernel/riscv64/dgemm_kernel_8x4_zvl128b.c
Normal file
492
kernel/riscv64/dgemm_kernel_8x4_zvl128b.c
Normal file
@@ -0,0 +1,492 @@
|
||||
/*
|
||||
|
||||
AUTOGENERATED KERNEL
|
||||
Script: ./kernel/riscv64/generate_kernel.py
|
||||
Settings:
|
||||
LMUL=4
|
||||
M=8
|
||||
M_tail_scalar_from=2
|
||||
N=4
|
||||
__riscv_='__riscv_'
|
||||
complex=False
|
||||
conjugate=False
|
||||
cpu='zvl128b'
|
||||
force_acc_double=False
|
||||
index_type='BLASLONG'
|
||||
op='gemm'
|
||||
param_precision='double'
|
||||
reg_width_bits=128
|
||||
tail_policy=''
|
||||
trace=False
|
||||
|
||||
Derived:
|
||||
ELEN_ACC=64
|
||||
ELEN_PARAM=64
|
||||
LMUL_ACC=4
|
||||
VFMACC='__riscv_vfmacc_vf_f64m4'
|
||||
VFMUL='__riscv_vfmul_vf_f64m4'
|
||||
VLEV='__riscv_vle64_v_f64m4'
|
||||
VLSEV='__riscv_vlse64_v_f64m4'
|
||||
VMACC_TO_ACC='__riscv_vfmacc_vf_f64m4'
|
||||
VMUL_TO_ACC='__riscv_vfmul_vf_f64m4'
|
||||
VSETVL='__riscv_vsetvl_e64m4'
|
||||
VSEV='__riscv_vse64_v_f64m4'
|
||||
VSSEV='__riscv_vsse64_v_f64m4'
|
||||
acc_vector_t='vfloat64m4_t'
|
||||
output='dgemm_kernel_8x4_zvl128b.c'
|
||||
param_scalar_t='double'
|
||||
param_vector_t='vfloat64m4_t'
|
||||
|
||||
*/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Double-precision GEMM micro-kernel (8x4 register tile, RVV e64/m4):
 *
 *     C += alpha * A * B
 *
 * Operand layout, as implied by the indexing below:
 *   A  - row panels; a panel of `r` rows starting at row m_top begins at
 *        A[m_top * K] and stores `r` consecutive values per k step.
 *   B  - column panels; a panel of `c` columns starting at column n_top
 *        begins at B[n_top * K] and stores `c` consecutive values per k step.
 *   C  - element (m, n) lives at C[n * ldc + m].
 *
 * Columns are processed in panels of 4, then 2, then 1.  Inside each column
 * panel, rows are processed as vector tiles of 8 and 4, followed by scalar
 * tiles of 2 and 1.
 *
 * Parameters:
 *   M, N, K  - tile dimensions (rows of C, columns of C, inner dimension).
 *   alpha    - scalar applied to the A*B product before accumulation into C.
 *   A, B     - packed input panels (read only).
 *   C        - output matrix, updated in place.
 *   ldc      - distance, in elements, between consecutive columns of C.
 *
 * Returns 0 unconditionally.
 */
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc)
{
    BLASLONG gvl = 0;
    BLASLONG m_top = 0;
    BLASLONG n_top = 0;

    /*
     * Every vector tile peels the k = 0 step (it loads A and B before the
     * k loop starts at 1), so it needs K >= 1.  For K <= 0 the product is
     * empty and C must stay untouched; return before any panel is read.
     */
    if (K <= 0) {
        return 0;
    }

    // -- MAIN PASS: column panels of width 4

    for (BLASLONG j = 0; j < N / 4; j += 1) {
        m_top = 0;
        /* Re-arm the 8-element vector length; the M & 4 tail lowers it to 4. */
        gvl = __riscv_vsetvl_e64m4(8);

        for (BLASLONG i = 0; i < M / 8; i += 1) {
            BLASLONG ai = m_top * K;
            BLASLONG bi = n_top * K;
            double B0 = B[bi + 0];
            double B1 = B[bi + 1];
            double B2 = B[bi + 2];
            double B3 = B[bi + 3];
            bi += 4;

            vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai], gvl);
            ai += 8;

            /* k = 0 initialises the accumulators with a plain multiply. */
            vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
            vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl);
            vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl);
            vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl);

            for (BLASLONG k = 1; k < K; k++) {
                B0 = B[bi + 0];
                B1 = B[bi + 1];
                B2 = B[bi + 2];
                B3 = B[bi + 3];
                bi += 4;

                A0 = __riscv_vle64_v_f64m4(&A[ai], gvl);
                ai += 8;

                result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
                result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl);
                result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl);
                result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl);
            }

            BLASLONG ci = n_top * ldc + m_top;

            vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl);
            ci += ldc;
            vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl);
            ci += ldc;
            vfloat64m4_t c2 = __riscv_vle64_v_f64m4(&C[ci], gvl);
            ci += ldc;
            vfloat64m4_t c3 = __riscv_vle64_v_f64m4(&C[ci], gvl);
            c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl);
            c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl);
            c2 = __riscv_vfmacc_vf_f64m4(c2, alpha, result2, gvl);
            c3 = __riscv_vfmacc_vf_f64m4(c3, alpha, result3, gvl);

            ci = n_top * ldc + m_top;

            __riscv_vse64_v_f64m4(&C[ci], c0, gvl);
            ci += ldc;
            __riscv_vse64_v_f64m4(&C[ci], c1, gvl);
            ci += ldc;
            __riscv_vse64_v_f64m4(&C[ci], c2, gvl);
            ci += ldc;
            __riscv_vse64_v_f64m4(&C[ci], c3, gvl);
            m_top += 8;
        }

        // -- tails for main pass

        if (M & 4) {
            gvl = __riscv_vsetvl_e64m4(4);

            BLASLONG ai = m_top * K;
            BLASLONG bi = n_top * K;
            double B0 = B[bi + 0];
            double B1 = B[bi + 1];
            double B2 = B[bi + 2];
            double B3 = B[bi + 3];
            bi += 4;

            vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai], gvl);
            ai += 4;

            vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
            vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl);
            vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl);
            vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl);

            for (BLASLONG k = 1; k < K; k++) {
                B0 = B[bi + 0];
                B1 = B[bi + 1];
                B2 = B[bi + 2];
                B3 = B[bi + 3];
                bi += 4;

                A0 = __riscv_vle64_v_f64m4(&A[ai], gvl);
                ai += 4;

                result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
                result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl);
                result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl);
                result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl);
            }

            BLASLONG ci = n_top * ldc + m_top;

            vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl);
            ci += ldc;
            vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl);
            ci += ldc;
            vfloat64m4_t c2 = __riscv_vle64_v_f64m4(&C[ci], gvl);
            ci += ldc;
            vfloat64m4_t c3 = __riscv_vle64_v_f64m4(&C[ci], gvl);
            c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl);
            c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl);
            c2 = __riscv_vfmacc_vf_f64m4(c2, alpha, result2, gvl);
            c3 = __riscv_vfmacc_vf_f64m4(c3, alpha, result3, gvl);

            ci = n_top * ldc + m_top;

            __riscv_vse64_v_f64m4(&C[ci], c0, gvl);
            ci += ldc;
            __riscv_vse64_v_f64m4(&C[ci], c1, gvl);
            ci += ldc;
            __riscv_vse64_v_f64m4(&C[ci], c2, gvl);
            ci += ldc;
            __riscv_vse64_v_f64m4(&C[ci], c3, gvl);
            m_top += 4;
        }

        if (M & 2) {
            /* Scalar 2x4 tile: result[2 * col + row]. */
            double result0 = 0;
            double result1 = 0;
            double result2 = 0;
            double result3 = 0;
            double result4 = 0;
            double result5 = 0;
            double result6 = 0;
            double result7 = 0;
            BLASLONG ai = m_top * K;
            BLASLONG bi = n_top * K;

            for (BLASLONG k = 0; k < K; k++) {
                result0 += A[ai + 0] * B[bi + 0];
                result1 += A[ai + 1] * B[bi + 0];
                result2 += A[ai + 0] * B[bi + 1];
                result3 += A[ai + 1] * B[bi + 1];
                result4 += A[ai + 0] * B[bi + 2];
                result5 += A[ai + 1] * B[bi + 2];
                result6 += A[ai + 0] * B[bi + 3];
                result7 += A[ai + 1] * B[bi + 3];
                ai += 2;
                bi += 4;
            }

            BLASLONG ci = n_top * ldc + m_top;
            C[ci + 0 * ldc + 0] += alpha * result0;
            C[ci + 0 * ldc + 1] += alpha * result1;
            C[ci + 1 * ldc + 0] += alpha * result2;
            C[ci + 1 * ldc + 1] += alpha * result3;
            C[ci + 2 * ldc + 0] += alpha * result4;
            C[ci + 2 * ldc + 1] += alpha * result5;
            C[ci + 3 * ldc + 0] += alpha * result6;
            C[ci + 3 * ldc + 1] += alpha * result7;
            m_top += 2;
        }

        if (M & 1) {
            /* Scalar 1x4 tile: one accumulator per column. */
            double result0 = 0;
            double result1 = 0;
            double result2 = 0;
            double result3 = 0;
            BLASLONG ai = m_top * K;
            BLASLONG bi = n_top * K;

            for (BLASLONG k = 0; k < K; k++) {
                result0 += A[ai + 0] * B[bi + 0];
                result1 += A[ai + 0] * B[bi + 1];
                result2 += A[ai + 0] * B[bi + 2];
                result3 += A[ai + 0] * B[bi + 3];
                ai += 1;
                bi += 4;
            }

            BLASLONG ci = n_top * ldc + m_top;
            C[ci + 0 * ldc + 0] += alpha * result0;
            C[ci + 1 * ldc + 0] += alpha * result1;
            C[ci + 2 * ldc + 0] += alpha * result2;
            C[ci + 3 * ldc + 0] += alpha * result3;
            m_top += 1;
        }

        n_top += 4;
    }

    // -- tails for N=2

    if (N & 2) {
        gvl = __riscv_vsetvl_e64m4(8);
        m_top = 0;

        for (BLASLONG i = 0; i < M / 8; i += 1) {
            BLASLONG ai = m_top * K;
            BLASLONG bi = n_top * K;
            double B0 = B[bi + 0];
            double B1 = B[bi + 1];
            bi += 2;

            vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai], gvl);
            ai += 8;

            vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
            vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl);

            for (BLASLONG k = 1; k < K; k++) {
                B0 = B[bi + 0];
                B1 = B[bi + 1];
                bi += 2;

                A0 = __riscv_vle64_v_f64m4(&A[ai], gvl);
                ai += 8;

                result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
                result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl);
            }

            BLASLONG ci = n_top * ldc + m_top;

            vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl);
            ci += ldc;
            vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl);
            c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl);
            c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl);

            ci = n_top * ldc + m_top;

            __riscv_vse64_v_f64m4(&C[ci], c0, gvl);
            ci += ldc;
            __riscv_vse64_v_f64m4(&C[ci], c1, gvl);
            m_top += 8;
        }

        if (M & 4) {
            gvl = __riscv_vsetvl_e64m4(4);

            BLASLONG ai = m_top * K;
            BLASLONG bi = n_top * K;
            double B0 = B[bi + 0];
            double B1 = B[bi + 1];
            bi += 2;

            vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai], gvl);
            ai += 4;

            vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
            vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl);

            for (BLASLONG k = 1; k < K; k++) {
                B0 = B[bi + 0];
                B1 = B[bi + 1];
                bi += 2;

                A0 = __riscv_vle64_v_f64m4(&A[ai], gvl);
                ai += 4;

                result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
                result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl);
            }

            BLASLONG ci = n_top * ldc + m_top;

            vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl);
            ci += ldc;
            vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl);
            c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl);
            c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl);

            ci = n_top * ldc + m_top;

            __riscv_vse64_v_f64m4(&C[ci], c0, gvl);
            ci += ldc;
            __riscv_vse64_v_f64m4(&C[ci], c1, gvl);
            m_top += 4;
        }

        if (M & 2) {
            /* Scalar 2x2 tile: result[2 * col + row]. */
            double result0 = 0;
            double result1 = 0;
            double result2 = 0;
            double result3 = 0;
            BLASLONG ai = m_top * K;
            BLASLONG bi = n_top * K;

            for (BLASLONG k = 0; k < K; k++) {
                result0 += A[ai + 0] * B[bi + 0];
                result1 += A[ai + 1] * B[bi + 0];
                result2 += A[ai + 0] * B[bi + 1];
                result3 += A[ai + 1] * B[bi + 1];
                ai += 2;
                bi += 2;
            }

            BLASLONG ci = n_top * ldc + m_top;
            C[ci + 0 * ldc + 0] += alpha * result0;
            C[ci + 0 * ldc + 1] += alpha * result1;
            C[ci + 1 * ldc + 0] += alpha * result2;
            C[ci + 1 * ldc + 1] += alpha * result3;
            m_top += 2;
        }

        if (M & 1) {
            /* Scalar 1x2 tile. */
            double result0 = 0;
            double result1 = 0;
            BLASLONG ai = m_top * K;
            BLASLONG bi = n_top * K;

            for (BLASLONG k = 0; k < K; k++) {
                result0 += A[ai + 0] * B[bi + 0];
                result1 += A[ai + 0] * B[bi + 1];
                ai += 1;
                bi += 2;
            }

            BLASLONG ci = n_top * ldc + m_top;
            C[ci + 0 * ldc + 0] += alpha * result0;
            C[ci + 1 * ldc + 0] += alpha * result1;
            m_top += 1;
        }

        n_top += 2;
    }

    // -- tails for N=1

    if (N & 1) {
        gvl = __riscv_vsetvl_e64m4(8);
        m_top = 0;

        for (BLASLONG i = 0; i < M / 8; i += 1) {
            BLASLONG ai = m_top * K;
            BLASLONG bi = n_top * K;
            double B0 = B[bi + 0];
            bi += 1;

            vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai], gvl);
            ai += 8;

            vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);

            for (BLASLONG k = 1; k < K; k++) {
                B0 = B[bi + 0];
                bi += 1;

                A0 = __riscv_vle64_v_f64m4(&A[ai], gvl);
                ai += 8;

                result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
            }

            BLASLONG ci = n_top * ldc + m_top;

            vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl);
            c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl);

            __riscv_vse64_v_f64m4(&C[ci], c0, gvl);
            m_top += 8;
        }

        if (M & 4) {
            gvl = __riscv_vsetvl_e64m4(4);

            BLASLONG ai = m_top * K;
            BLASLONG bi = n_top * K;
            double B0 = B[bi + 0];
            bi += 1;

            vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai], gvl);
            ai += 4;

            vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);

            for (BLASLONG k = 1; k < K; k++) {
                B0 = B[bi + 0];
                bi += 1;

                A0 = __riscv_vle64_v_f64m4(&A[ai], gvl);
                ai += 4;

                result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
            }

            BLASLONG ci = n_top * ldc + m_top;

            vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl);
            c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl);

            __riscv_vse64_v_f64m4(&C[ci], c0, gvl);
            m_top += 4;
        }

        if (M & 2) {
            /* Scalar 2x1 tile. */
            double result0 = 0;
            double result1 = 0;
            BLASLONG ai = m_top * K;
            BLASLONG bi = n_top * K;

            for (BLASLONG k = 0; k < K; k++) {
                result0 += A[ai + 0] * B[bi + 0];
                result1 += A[ai + 1] * B[bi + 0];
                ai += 2;
                bi += 1;
            }

            BLASLONG ci = n_top * ldc + m_top;
            C[ci + 0 * ldc + 0] += alpha * result0;
            C[ci + 0 * ldc + 1] += alpha * result1;
            m_top += 2;
        }

        if (M & 1) {
            /* Scalar 1x1 tile: a plain dot product. */
            double result0 = 0;
            BLASLONG ai = m_top * K;
            BLASLONG bi = n_top * K;

            for (BLASLONG k = 0; k < K; k++) {
                result0 += A[ai + 0] * B[bi + 0];
                ai += 1;
                bi += 1;
            }

            BLASLONG ci = n_top * ldc + m_top;
            C[ci + 0 * ldc + 0] += alpha * result0;
            m_top += 1;
        }

        n_top += 1;
    }

    return 0;
}
|
||||
860
kernel/riscv64/dgemm_kernel_8x8_zvl256b.c
Normal file
860
kernel/riscv64/dgemm_kernel_8x8_zvl256b.c
Normal file
@@ -0,0 +1,860 @@
|
||||
/*
|
||||
|
||||
AUTOGENERATED KERNEL
|
||||
Settings:
|
||||
LMUL=1
|
||||
M=8
|
||||
M_tail_scalar_from=2
|
||||
N=8
|
||||
__riscv_='__riscv_'
|
||||
complex=False
|
||||
conjugate=False
|
||||
cpu='zvl256b'
|
||||
force_acc_double=False
|
||||
index_type='BLASLONG'
|
||||
op='gemm'
|
||||
param_precision='double'
|
||||
reg_width_bits=256
|
||||
tail_policy=''
|
||||
trace=False
|
||||
|
||||
Derived:
|
||||
ELEN_ACC=64
|
||||
ELEN_PARAM=64
|
||||
LMUL_ACC=1
|
||||
VFMACC='__riscv_vfmacc_vf_f64m1'
|
||||
VFMUL='__riscv_vfmul_vf_f64m1'
|
||||
VLEV='__riscv_vle64_v_f64m1'
|
||||
VLSEV='__riscv_vlse64_v_f64m1'
|
||||
VMACC_TO_ACC='__riscv_vfmacc_vf_f64m1'
|
||||
VMUL_TO_ACC='__riscv_vfmul_vf_f64m1'
|
||||
VSETVL='__riscv_vsetvl_e64m1'
|
||||
VSEV='__riscv_vse64_v_f64m1'
|
||||
VSSEV='__riscv_vsse64_v_f64m1'
|
||||
acc_vector_t='vfloat64m1_t'
|
||||
output='dgemm_kernel_8x8_zvl256b.c'
|
||||
param_scalar_t='double'
|
||||
param_vector_t='vfloat64m1_t'
|
||||
|
||||
*/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
||||
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc)
|
||||
|
||||
{
|
||||
BLASLONG gvl = 0;
|
||||
BLASLONG m_top = 0;
|
||||
BLASLONG n_top = 0;
|
||||
|
||||
|
||||
// -- MAIN PASS
|
||||
|
||||
for (BLASLONG j=0; j<N/8; j+=1) {
|
||||
m_top = 0;
|
||||
BLASLONG gvl = __riscv_vsetvl_e64m1(4);
|
||||
|
||||
|
||||
for (BLASLONG i=0; i<M/8; i+=1) {
|
||||
BLASLONG ai=m_top*K;
|
||||
BLASLONG bi=n_top*K;
|
||||
double B0 = B[bi+0];
|
||||
double B1 = B[bi+1];
|
||||
double B2 = B[bi+2];
|
||||
double B3 = B[bi+3];
|
||||
double B4 = B[bi+4];
|
||||
double B5 = B[bi+5];
|
||||
double B6 = B[bi+6];
|
||||
double B7 = B[bi+7];
|
||||
bi += 8;
|
||||
|
||||
vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
|
||||
vfloat64m1_t A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
|
||||
ai += 8;
|
||||
|
||||
vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
|
||||
vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A1, B0, gvl);
|
||||
vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B1, gvl);
|
||||
vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A1, B1, gvl);
|
||||
vfloat64m1_t result4 = __riscv_vfmul_vf_f64m1( A0, B2, gvl);
|
||||
vfloat64m1_t result5 = __riscv_vfmul_vf_f64m1( A1, B2, gvl);
|
||||
vfloat64m1_t result6 = __riscv_vfmul_vf_f64m1( A0, B3, gvl);
|
||||
vfloat64m1_t result7 = __riscv_vfmul_vf_f64m1( A1, B3, gvl);
|
||||
vfloat64m1_t result8 = __riscv_vfmul_vf_f64m1( A0, B4, gvl);
|
||||
vfloat64m1_t result9 = __riscv_vfmul_vf_f64m1( A1, B4, gvl);
|
||||
vfloat64m1_t result10 = __riscv_vfmul_vf_f64m1( A0, B5, gvl);
|
||||
vfloat64m1_t result11 = __riscv_vfmul_vf_f64m1( A1, B5, gvl);
|
||||
vfloat64m1_t result12 = __riscv_vfmul_vf_f64m1( A0, B6, gvl);
|
||||
vfloat64m1_t result13 = __riscv_vfmul_vf_f64m1( A1, B6, gvl);
|
||||
vfloat64m1_t result14 = __riscv_vfmul_vf_f64m1( A0, B7, gvl);
|
||||
vfloat64m1_t result15 = __riscv_vfmul_vf_f64m1( A1, B7, gvl);
|
||||
|
||||
for(BLASLONG k=1; k<K; k++) {
|
||||
B0 = B[bi+0];
|
||||
B1 = B[bi+1];
|
||||
B2 = B[bi+2];
|
||||
B3 = B[bi+3];
|
||||
B4 = B[bi+4];
|
||||
B5 = B[bi+5];
|
||||
B6 = B[bi+6];
|
||||
B7 = B[bi+7];
|
||||
bi += 8;
|
||||
|
||||
A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
|
||||
A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
|
||||
ai += 8;
|
||||
|
||||
result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
|
||||
result1 = __riscv_vfmacc_vf_f64m1( result1, B0, A1, gvl);
|
||||
result2 = __riscv_vfmacc_vf_f64m1( result2, B1, A0, gvl);
|
||||
result3 = __riscv_vfmacc_vf_f64m1( result3, B1, A1, gvl);
|
||||
result4 = __riscv_vfmacc_vf_f64m1( result4, B2, A0, gvl);
|
||||
result5 = __riscv_vfmacc_vf_f64m1( result5, B2, A1, gvl);
|
||||
result6 = __riscv_vfmacc_vf_f64m1( result6, B3, A0, gvl);
|
||||
result7 = __riscv_vfmacc_vf_f64m1( result7, B3, A1, gvl);
|
||||
result8 = __riscv_vfmacc_vf_f64m1( result8, B4, A0, gvl);
|
||||
result9 = __riscv_vfmacc_vf_f64m1( result9, B4, A1, gvl);
|
||||
result10 = __riscv_vfmacc_vf_f64m1( result10, B5, A0, gvl);
|
||||
result11 = __riscv_vfmacc_vf_f64m1( result11, B5, A1, gvl);
|
||||
result12 = __riscv_vfmacc_vf_f64m1( result12, B6, A0, gvl);
|
||||
result13 = __riscv_vfmacc_vf_f64m1( result13, B6, A1, gvl);
|
||||
result14 = __riscv_vfmacc_vf_f64m1( result14, B7, A0, gvl);
|
||||
result15 = __riscv_vfmacc_vf_f64m1( result15, B7, A1, gvl);
|
||||
}
|
||||
|
||||
|
||||
BLASLONG ci=n_top*ldc+m_top;
|
||||
|
||||
vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
|
||||
vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
|
||||
vfloat64m1_t c2 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
|
||||
vfloat64m1_t c3 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
|
||||
vfloat64m1_t c4 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
|
||||
vfloat64m1_t c5 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
|
||||
vfloat64m1_t c6 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
|
||||
vfloat64m1_t c7 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
|
||||
vfloat64m1_t c8 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
|
||||
vfloat64m1_t c9 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
|
||||
vfloat64m1_t c10 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
|
||||
vfloat64m1_t c11 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
|
||||
vfloat64m1_t c12 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
|
||||
vfloat64m1_t c13 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
|
||||
vfloat64m1_t c14 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
|
||||
vfloat64m1_t c15 = __riscv_vle64_v_f64m1( &C[ci], gvl);
|
||||
c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl );
|
||||
c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl );
|
||||
c2 = __riscv_vfmacc_vf_f64m1( c2, alpha, result2, gvl );
|
||||
c3 = __riscv_vfmacc_vf_f64m1( c3, alpha, result3, gvl );
|
||||
c4 = __riscv_vfmacc_vf_f64m1( c4, alpha, result4, gvl );
|
||||
c5 = __riscv_vfmacc_vf_f64m1( c5, alpha, result5, gvl );
|
||||
c6 = __riscv_vfmacc_vf_f64m1( c6, alpha, result6, gvl );
|
||||
c7 = __riscv_vfmacc_vf_f64m1( c7, alpha, result7, gvl );
|
||||
c8 = __riscv_vfmacc_vf_f64m1( c8, alpha, result8, gvl );
|
||||
c9 = __riscv_vfmacc_vf_f64m1( c9, alpha, result9, gvl );
|
||||
c10 = __riscv_vfmacc_vf_f64m1( c10, alpha, result10, gvl );
|
||||
c11 = __riscv_vfmacc_vf_f64m1( c11, alpha, result11, gvl );
|
||||
c12 = __riscv_vfmacc_vf_f64m1( c12, alpha, result12, gvl );
|
||||
c13 = __riscv_vfmacc_vf_f64m1( c13, alpha, result13, gvl );
|
||||
c14 = __riscv_vfmacc_vf_f64m1( c14, alpha, result14, gvl );
|
||||
c15 = __riscv_vfmacc_vf_f64m1( c15, alpha, result15, gvl );
|
||||
|
||||
ci=n_top*ldc+m_top;
|
||||
|
||||
__riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl;
|
||||
__riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*1;
|
||||
__riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += gvl;
|
||||
__riscv_vse64_v_f64m1( &C[ci], c3, gvl); ci += ldc-gvl*1;
|
||||
__riscv_vse64_v_f64m1( &C[ci], c4, gvl); ci += gvl;
|
||||
__riscv_vse64_v_f64m1( &C[ci], c5, gvl); ci += ldc-gvl*1;
|
||||
__riscv_vse64_v_f64m1( &C[ci], c6, gvl); ci += gvl;
|
||||
__riscv_vse64_v_f64m1( &C[ci], c7, gvl); ci += ldc-gvl*1;
|
||||
__riscv_vse64_v_f64m1( &C[ci], c8, gvl); ci += gvl;
|
||||
__riscv_vse64_v_f64m1( &C[ci], c9, gvl); ci += ldc-gvl*1;
|
||||
__riscv_vse64_v_f64m1( &C[ci], c10, gvl); ci += gvl;
|
||||
__riscv_vse64_v_f64m1( &C[ci], c11, gvl); ci += ldc-gvl*1;
|
||||
__riscv_vse64_v_f64m1( &C[ci], c12, gvl); ci += gvl;
|
||||
__riscv_vse64_v_f64m1( &C[ci], c13, gvl); ci += ldc-gvl*1;
|
||||
__riscv_vse64_v_f64m1( &C[ci], c14, gvl); ci += gvl;
|
||||
__riscv_vse64_v_f64m1( &C[ci], c15, gvl);
|
||||
m_top += 8;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// -- tails for main pass
|
||||
|
||||
if( M & 4 ) {
|
||||
gvl = __riscv_vsetvl_e64m1(4);
|
||||
|
||||
BLASLONG ai=m_top*K;
|
||||
BLASLONG bi=n_top*K;
|
||||
double B0 = B[bi+0];
|
||||
double B1 = B[bi+1];
|
||||
double B2 = B[bi+2];
|
||||
double B3 = B[bi+3];
|
||||
double B4 = B[bi+4];
|
||||
double B5 = B[bi+5];
|
||||
double B6 = B[bi+6];
|
||||
double B7 = B[bi+7];
|
||||
bi += 8;
|
||||
|
||||
vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
|
||||
ai += 4;
|
||||
|
||||
vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
|
||||
vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A0, B1, gvl);
|
||||
vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B2, gvl);
|
||||
vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A0, B3, gvl);
|
||||
vfloat64m1_t result4 = __riscv_vfmul_vf_f64m1( A0, B4, gvl);
|
||||
vfloat64m1_t result5 = __riscv_vfmul_vf_f64m1( A0, B5, gvl);
|
||||
vfloat64m1_t result6 = __riscv_vfmul_vf_f64m1( A0, B6, gvl);
|
||||
vfloat64m1_t result7 = __riscv_vfmul_vf_f64m1( A0, B7, gvl);
|
||||
|
||||
for(BLASLONG k=1; k<K; k++) {
|
||||
B0 = B[bi+0];
|
||||
B1 = B[bi+1];
|
||||
B2 = B[bi+2];
|
||||
B3 = B[bi+3];
|
||||
B4 = B[bi+4];
|
||||
B5 = B[bi+5];
|
||||
B6 = B[bi+6];
|
||||
B7 = B[bi+7];
|
||||
bi += 8;
|
||||
|
||||
A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
|
||||
ai += 4;
|
||||
|
||||
result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
|
||||
result1 = __riscv_vfmacc_vf_f64m1( result1, B1, A0, gvl);
|
||||
result2 = __riscv_vfmacc_vf_f64m1( result2, B2, A0, gvl);
|
||||
result3 = __riscv_vfmacc_vf_f64m1( result3, B3, A0, gvl);
|
||||
result4 = __riscv_vfmacc_vf_f64m1( result4, B4, A0, gvl);
|
||||
result5 = __riscv_vfmacc_vf_f64m1( result5, B5, A0, gvl);
|
||||
result6 = __riscv_vfmacc_vf_f64m1( result6, B6, A0, gvl);
|
||||
result7 = __riscv_vfmacc_vf_f64m1( result7, B7, A0, gvl);
|
||||
}
|
||||
|
||||
|
||||
BLASLONG ci=n_top*ldc+m_top;
|
||||
|
||||
vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
|
||||
vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
|
||||
vfloat64m1_t c2 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
|
||||
vfloat64m1_t c3 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
|
||||
vfloat64m1_t c4 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
|
||||
vfloat64m1_t c5 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
|
||||
vfloat64m1_t c6 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
|
||||
vfloat64m1_t c7 = __riscv_vle64_v_f64m1( &C[ci], gvl);
|
||||
c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl );
|
||||
c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl );
|
||||
c2 = __riscv_vfmacc_vf_f64m1( c2, alpha, result2, gvl );
|
||||
c3 = __riscv_vfmacc_vf_f64m1( c3, alpha, result3, gvl );
|
||||
c4 = __riscv_vfmacc_vf_f64m1( c4, alpha, result4, gvl );
|
||||
c5 = __riscv_vfmacc_vf_f64m1( c5, alpha, result5, gvl );
|
||||
c6 = __riscv_vfmacc_vf_f64m1( c6, alpha, result6, gvl );
|
||||
c7 = __riscv_vfmacc_vf_f64m1( c7, alpha, result7, gvl );
|
||||
|
||||
ci=n_top*ldc+m_top;
|
||||
|
||||
__riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
|
||||
__riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*0;
|
||||
__riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += ldc-gvl*0;
|
||||
__riscv_vse64_v_f64m1( &C[ci], c3, gvl); ci += ldc-gvl*0;
|
||||
__riscv_vse64_v_f64m1( &C[ci], c4, gvl); ci += ldc-gvl*0;
|
||||
__riscv_vse64_v_f64m1( &C[ci], c5, gvl); ci += ldc-gvl*0;
|
||||
__riscv_vse64_v_f64m1( &C[ci], c6, gvl); ci += ldc-gvl*0;
|
||||
__riscv_vse64_v_f64m1( &C[ci], c7, gvl);
|
||||
m_top += 4;
|
||||
}
|
||||
|
||||
|
||||
if( M & 2 ) {
|
||||
double result0 = 0;
|
||||
double result1 = 0;
|
||||
double result2 = 0;
|
||||
double result3 = 0;
|
||||
double result4 = 0;
|
||||
double result5 = 0;
|
||||
double result6 = 0;
|
||||
double result7 = 0;
|
||||
double result8 = 0;
|
||||
double result9 = 0;
|
||||
double result10 = 0;
|
||||
double result11 = 0;
|
||||
double result12 = 0;
|
||||
double result13 = 0;
|
||||
double result14 = 0;
|
||||
double result15 = 0;
|
||||
BLASLONG ai=m_top*K;
|
||||
BLASLONG bi=n_top*K;
|
||||
|
||||
for(BLASLONG k=0; k<K; k++) {
|
||||
result0+=A[ai+0]*B[bi+0];
|
||||
result1+=A[ai+1]*B[bi+0];
|
||||
result2+=A[ai+0]*B[bi+1];
|
||||
result3+=A[ai+1]*B[bi+1];
|
||||
result4+=A[ai+0]*B[bi+2];
|
||||
result5+=A[ai+1]*B[bi+2];
|
||||
result6+=A[ai+0]*B[bi+3];
|
||||
result7+=A[ai+1]*B[bi+3];
|
||||
result8+=A[ai+0]*B[bi+4];
|
||||
result9+=A[ai+1]*B[bi+4];
|
||||
result10+=A[ai+0]*B[bi+5];
|
||||
result11+=A[ai+1]*B[bi+5];
|
||||
result12+=A[ai+0]*B[bi+6];
|
||||
result13+=A[ai+1]*B[bi+6];
|
||||
result14+=A[ai+0]*B[bi+7];
|
||||
result15+=A[ai+1]*B[bi+7];
|
||||
ai+=2;
|
||||
bi+=8;
|
||||
}
|
||||
|
||||
BLASLONG ci=n_top*ldc+m_top;
|
||||
C[ci+0*ldc+0] += alpha * result0;
|
||||
C[ci+0*ldc+1] += alpha * result1;
|
||||
C[ci+1*ldc+0] += alpha * result2;
|
||||
C[ci+1*ldc+1] += alpha * result3;
|
||||
C[ci+2*ldc+0] += alpha * result4;
|
||||
C[ci+2*ldc+1] += alpha * result5;
|
||||
C[ci+3*ldc+0] += alpha * result6;
|
||||
C[ci+3*ldc+1] += alpha * result7;
|
||||
C[ci+4*ldc+0] += alpha * result8;
|
||||
C[ci+4*ldc+1] += alpha * result9;
|
||||
C[ci+5*ldc+0] += alpha * result10;
|
||||
C[ci+5*ldc+1] += alpha * result11;
|
||||
C[ci+6*ldc+0] += alpha * result12;
|
||||
C[ci+6*ldc+1] += alpha * result13;
|
||||
C[ci+7*ldc+0] += alpha * result14;
|
||||
C[ci+7*ldc+1] += alpha * result15;
|
||||
m_top+=2;
|
||||
}
|
||||
|
||||
|
||||
if( M & 1 ) {
|
||||
double result0 = 0;
|
||||
double result1 = 0;
|
||||
double result2 = 0;
|
||||
double result3 = 0;
|
||||
double result4 = 0;
|
||||
double result5 = 0;
|
||||
double result6 = 0;
|
||||
double result7 = 0;
|
||||
BLASLONG ai=m_top*K;
|
||||
BLASLONG bi=n_top*K;
|
||||
|
||||
for(BLASLONG k=0; k<K; k++) {
|
||||
result0+=A[ai+0]*B[bi+0];
|
||||
result1+=A[ai+0]*B[bi+1];
|
||||
result2+=A[ai+0]*B[bi+2];
|
||||
result3+=A[ai+0]*B[bi+3];
|
||||
result4+=A[ai+0]*B[bi+4];
|
||||
result5+=A[ai+0]*B[bi+5];
|
||||
result6+=A[ai+0]*B[bi+6];
|
||||
result7+=A[ai+0]*B[bi+7];
|
||||
ai+=1;
|
||||
bi+=8;
|
||||
}
|
||||
|
||||
BLASLONG ci=n_top*ldc+m_top;
|
||||
C[ci+0*ldc+0] += alpha * result0;
|
||||
C[ci+1*ldc+0] += alpha * result1;
|
||||
C[ci+2*ldc+0] += alpha * result2;
|
||||
C[ci+3*ldc+0] += alpha * result3;
|
||||
C[ci+4*ldc+0] += alpha * result4;
|
||||
C[ci+5*ldc+0] += alpha * result5;
|
||||
C[ci+6*ldc+0] += alpha * result6;
|
||||
C[ci+7*ldc+0] += alpha * result7;
|
||||
m_top+=1;
|
||||
}
|
||||
|
||||
n_top += 8;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// -- tails for N=4
|
||||
|
||||
if( N & 4 ) {
|
||||
gvl = __riscv_vsetvl_e64m1(4);
|
||||
m_top = 0;
|
||||
|
||||
for (BLASLONG i=0; i<M/8; i+=1) {
|
||||
BLASLONG ai=m_top*K;
|
||||
BLASLONG bi=n_top*K;
|
||||
double B0 = B[bi+0];
|
||||
double B1 = B[bi+1];
|
||||
double B2 = B[bi+2];
|
||||
double B3 = B[bi+3];
|
||||
bi += 4;
|
||||
|
||||
vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
|
||||
vfloat64m1_t A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
|
||||
ai += 8;
|
||||
|
||||
vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
|
||||
vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A1, B0, gvl);
|
||||
vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B1, gvl);
|
||||
vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A1, B1, gvl);
|
||||
vfloat64m1_t result4 = __riscv_vfmul_vf_f64m1( A0, B2, gvl);
|
||||
vfloat64m1_t result5 = __riscv_vfmul_vf_f64m1( A1, B2, gvl);
|
||||
vfloat64m1_t result6 = __riscv_vfmul_vf_f64m1( A0, B3, gvl);
|
||||
vfloat64m1_t result7 = __riscv_vfmul_vf_f64m1( A1, B3, gvl);
|
||||
|
||||
for(BLASLONG k=1; k<K; k++) {
|
||||
B0 = B[bi+0];
|
||||
B1 = B[bi+1];
|
||||
B2 = B[bi+2];
|
||||
B3 = B[bi+3];
|
||||
bi += 4;
|
||||
|
||||
A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
|
||||
A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
|
||||
ai += 8;
|
||||
|
||||
result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
|
||||
result1 = __riscv_vfmacc_vf_f64m1( result1, B0, A1, gvl);
|
||||
result2 = __riscv_vfmacc_vf_f64m1( result2, B1, A0, gvl);
|
||||
result3 = __riscv_vfmacc_vf_f64m1( result3, B1, A1, gvl);
|
||||
result4 = __riscv_vfmacc_vf_f64m1( result4, B2, A0, gvl);
|
||||
result5 = __riscv_vfmacc_vf_f64m1( result5, B2, A1, gvl);
|
||||
result6 = __riscv_vfmacc_vf_f64m1( result6, B3, A0, gvl);
|
||||
result7 = __riscv_vfmacc_vf_f64m1( result7, B3, A1, gvl);
|
||||
}
|
||||
|
||||
|
||||
BLASLONG ci=n_top*ldc+m_top;
|
||||
|
||||
vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
|
||||
vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
|
||||
vfloat64m1_t c2 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
|
||||
vfloat64m1_t c3 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
|
||||
vfloat64m1_t c4 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
|
||||
vfloat64m1_t c5 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
|
||||
vfloat64m1_t c6 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
|
||||
vfloat64m1_t c7 = __riscv_vle64_v_f64m1( &C[ci], gvl);
|
||||
c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl );
|
||||
c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl );
|
||||
c2 = __riscv_vfmacc_vf_f64m1( c2, alpha, result2, gvl );
|
||||
c3 = __riscv_vfmacc_vf_f64m1( c3, alpha, result3, gvl );
|
||||
c4 = __riscv_vfmacc_vf_f64m1( c4, alpha, result4, gvl );
|
||||
c5 = __riscv_vfmacc_vf_f64m1( c5, alpha, result5, gvl );
|
||||
c6 = __riscv_vfmacc_vf_f64m1( c6, alpha, result6, gvl );
|
||||
c7 = __riscv_vfmacc_vf_f64m1( c7, alpha, result7, gvl );
|
||||
|
||||
ci=n_top*ldc+m_top;
|
||||
|
||||
__riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl;
|
||||
__riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*1;
|
||||
__riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += gvl;
|
||||
__riscv_vse64_v_f64m1( &C[ci], c3, gvl); ci += ldc-gvl*1;
|
||||
__riscv_vse64_v_f64m1( &C[ci], c4, gvl); ci += gvl;
|
||||
__riscv_vse64_v_f64m1( &C[ci], c5, gvl); ci += ldc-gvl*1;
|
||||
__riscv_vse64_v_f64m1( &C[ci], c6, gvl); ci += gvl;
|
||||
__riscv_vse64_v_f64m1( &C[ci], c7, gvl);
|
||||
m_top += 8;
|
||||
}
|
||||
|
||||
|
||||
if( M & 4 ) {
|
||||
gvl = __riscv_vsetvl_e64m1(4);
|
||||
|
||||
BLASLONG ai=m_top*K;
|
||||
BLASLONG bi=n_top*K;
|
||||
double B0 = B[bi+0];
|
||||
double B1 = B[bi+1];
|
||||
double B2 = B[bi+2];
|
||||
double B3 = B[bi+3];
|
||||
bi += 4;
|
||||
|
||||
vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
|
||||
ai += 4;
|
||||
|
||||
vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
|
||||
vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A0, B1, gvl);
|
||||
vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B2, gvl);
|
||||
vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A0, B3, gvl);
|
||||
|
||||
for(BLASLONG k=1; k<K; k++) {
|
||||
B0 = B[bi+0];
|
||||
B1 = B[bi+1];
|
||||
B2 = B[bi+2];
|
||||
B3 = B[bi+3];
|
||||
bi += 4;
|
||||
|
||||
A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
|
||||
ai += 4;
|
||||
|
||||
result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
|
||||
result1 = __riscv_vfmacc_vf_f64m1( result1, B1, A0, gvl);
|
||||
result2 = __riscv_vfmacc_vf_f64m1( result2, B2, A0, gvl);
|
||||
result3 = __riscv_vfmacc_vf_f64m1( result3, B3, A0, gvl);
|
||||
}
|
||||
|
||||
|
||||
BLASLONG ci=n_top*ldc+m_top;
|
||||
|
||||
vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
|
||||
vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
|
||||
vfloat64m1_t c2 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
|
||||
vfloat64m1_t c3 = __riscv_vle64_v_f64m1( &C[ci], gvl);
|
||||
c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl );
|
||||
c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl );
|
||||
c2 = __riscv_vfmacc_vf_f64m1( c2, alpha, result2, gvl );
|
||||
c3 = __riscv_vfmacc_vf_f64m1( c3, alpha, result3, gvl );
|
||||
|
||||
ci=n_top*ldc+m_top;
|
||||
|
||||
__riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
|
||||
__riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*0;
|
||||
__riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += ldc-gvl*0;
|
||||
__riscv_vse64_v_f64m1( &C[ci], c3, gvl);
|
||||
m_top += 4;
|
||||
}
|
||||
|
||||
|
||||
if( M & 2 ) {
|
||||
double result0 = 0;
|
||||
double result1 = 0;
|
||||
double result2 = 0;
|
||||
double result3 = 0;
|
||||
double result4 = 0;
|
||||
double result5 = 0;
|
||||
double result6 = 0;
|
||||
double result7 = 0;
|
||||
BLASLONG ai=m_top*K;
|
||||
BLASLONG bi=n_top*K;
|
||||
|
||||
for(BLASLONG k=0; k<K; k++) {
|
||||
result0+=A[ai+0]*B[bi+0];
|
||||
result1+=A[ai+1]*B[bi+0];
|
||||
result2+=A[ai+0]*B[bi+1];
|
||||
result3+=A[ai+1]*B[bi+1];
|
||||
result4+=A[ai+0]*B[bi+2];
|
||||
result5+=A[ai+1]*B[bi+2];
|
||||
result6+=A[ai+0]*B[bi+3];
|
||||
result7+=A[ai+1]*B[bi+3];
|
||||
ai+=2;
|
||||
bi+=4;
|
||||
}
|
||||
|
||||
BLASLONG ci=n_top*ldc+m_top;
|
||||
C[ci+0*ldc+0] += alpha * result0;
|
||||
C[ci+0*ldc+1] += alpha * result1;
|
||||
C[ci+1*ldc+0] += alpha * result2;
|
||||
C[ci+1*ldc+1] += alpha * result3;
|
||||
C[ci+2*ldc+0] += alpha * result4;
|
||||
C[ci+2*ldc+1] += alpha * result5;
|
||||
C[ci+3*ldc+0] += alpha * result6;
|
||||
C[ci+3*ldc+1] += alpha * result7;
|
||||
m_top+=2;
|
||||
}
|
||||
|
||||
|
||||
if( M & 1 ) {
|
||||
double result0 = 0;
|
||||
double result1 = 0;
|
||||
double result2 = 0;
|
||||
double result3 = 0;
|
||||
BLASLONG ai=m_top*K;
|
||||
BLASLONG bi=n_top*K;
|
||||
|
||||
for(BLASLONG k=0; k<K; k++) {
|
||||
result0+=A[ai+0]*B[bi+0];
|
||||
result1+=A[ai+0]*B[bi+1];
|
||||
result2+=A[ai+0]*B[bi+2];
|
||||
result3+=A[ai+0]*B[bi+3];
|
||||
ai+=1;
|
||||
bi+=4;
|
||||
}
|
||||
|
||||
BLASLONG ci=n_top*ldc+m_top;
|
||||
C[ci+0*ldc+0] += alpha * result0;
|
||||
C[ci+1*ldc+0] += alpha * result1;
|
||||
C[ci+2*ldc+0] += alpha * result2;
|
||||
C[ci+3*ldc+0] += alpha * result3;
|
||||
m_top+=1;
|
||||
}
|
||||
|
||||
n_top += 4;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// -- tails for N=2
|
||||
|
||||
if( N & 2 ) {
|
||||
gvl = __riscv_vsetvl_e64m1(4);
|
||||
m_top = 0;
|
||||
|
||||
for (BLASLONG i=0; i<M/8; i+=1) {
|
||||
BLASLONG ai=m_top*K;
|
||||
BLASLONG bi=n_top*K;
|
||||
double B0 = B[bi+0];
|
||||
double B1 = B[bi+1];
|
||||
bi += 2;
|
||||
|
||||
vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
|
||||
vfloat64m1_t A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
|
||||
ai += 8;
|
||||
|
||||
vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
|
||||
vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A1, B0, gvl);
|
||||
vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B1, gvl);
|
||||
vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A1, B1, gvl);
|
||||
|
||||
for(BLASLONG k=1; k<K; k++) {
|
||||
B0 = B[bi+0];
|
||||
B1 = B[bi+1];
|
||||
bi += 2;
|
||||
|
||||
A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
|
||||
A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
|
||||
ai += 8;
|
||||
|
||||
result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
|
||||
result1 = __riscv_vfmacc_vf_f64m1( result1, B0, A1, gvl);
|
||||
result2 = __riscv_vfmacc_vf_f64m1( result2, B1, A0, gvl);
|
||||
result3 = __riscv_vfmacc_vf_f64m1( result3, B1, A1, gvl);
|
||||
}
|
||||
|
||||
|
||||
BLASLONG ci=n_top*ldc+m_top;
|
||||
|
||||
vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
|
||||
vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
|
||||
vfloat64m1_t c2 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
|
||||
vfloat64m1_t c3 = __riscv_vle64_v_f64m1( &C[ci], gvl);
|
||||
c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl );
|
||||
c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl );
|
||||
c2 = __riscv_vfmacc_vf_f64m1( c2, alpha, result2, gvl );
|
||||
c3 = __riscv_vfmacc_vf_f64m1( c3, alpha, result3, gvl );
|
||||
|
||||
ci=n_top*ldc+m_top;
|
||||
|
||||
__riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl;
|
||||
__riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*1;
|
||||
__riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += gvl;
|
||||
__riscv_vse64_v_f64m1( &C[ci], c3, gvl);
|
||||
m_top += 8;
|
||||
}
|
||||
|
||||
|
||||
if( M & 4 ) {
|
||||
gvl = __riscv_vsetvl_e64m1(4);
|
||||
|
||||
BLASLONG ai=m_top*K;
|
||||
BLASLONG bi=n_top*K;
|
||||
double B0 = B[bi+0];
|
||||
double B1 = B[bi+1];
|
||||
bi += 2;
|
||||
|
||||
vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
|
||||
ai += 4;
|
||||
|
||||
vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
|
||||
vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A0, B1, gvl);
|
||||
|
||||
for(BLASLONG k=1; k<K; k++) {
|
||||
B0 = B[bi+0];
|
||||
B1 = B[bi+1];
|
||||
bi += 2;
|
||||
|
||||
A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
|
||||
ai += 4;
|
||||
|
||||
result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
|
||||
result1 = __riscv_vfmacc_vf_f64m1( result1, B1, A0, gvl);
|
||||
}
|
||||
|
||||
|
||||
BLASLONG ci=n_top*ldc+m_top;
|
||||
|
||||
vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
|
||||
vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl);
|
||||
c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl );
|
||||
c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl );
|
||||
|
||||
ci=n_top*ldc+m_top;
|
||||
|
||||
__riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
|
||||
__riscv_vse64_v_f64m1( &C[ci], c1, gvl);
|
||||
m_top += 4;
|
||||
}
|
||||
|
||||
|
||||
if( M & 2 ) {
|
||||
double result0 = 0;
|
||||
double result1 = 0;
|
||||
double result2 = 0;
|
||||
double result3 = 0;
|
||||
BLASLONG ai=m_top*K;
|
||||
BLASLONG bi=n_top*K;
|
||||
|
||||
for(BLASLONG k=0; k<K; k++) {
|
||||
result0+=A[ai+0]*B[bi+0];
|
||||
result1+=A[ai+1]*B[bi+0];
|
||||
result2+=A[ai+0]*B[bi+1];
|
||||
result3+=A[ai+1]*B[bi+1];
|
||||
ai+=2;
|
||||
bi+=2;
|
||||
}
|
||||
|
||||
BLASLONG ci=n_top*ldc+m_top;
|
||||
C[ci+0*ldc+0] += alpha * result0;
|
||||
C[ci+0*ldc+1] += alpha * result1;
|
||||
C[ci+1*ldc+0] += alpha * result2;
|
||||
C[ci+1*ldc+1] += alpha * result3;
|
||||
m_top+=2;
|
||||
}
|
||||
|
||||
|
||||
if( M & 1 ) {
|
||||
double result0 = 0;
|
||||
double result1 = 0;
|
||||
BLASLONG ai=m_top*K;
|
||||
BLASLONG bi=n_top*K;
|
||||
|
||||
for(BLASLONG k=0; k<K; k++) {
|
||||
result0+=A[ai+0]*B[bi+0];
|
||||
result1+=A[ai+0]*B[bi+1];
|
||||
ai+=1;
|
||||
bi+=2;
|
||||
}
|
||||
|
||||
BLASLONG ci=n_top*ldc+m_top;
|
||||
C[ci+0*ldc+0] += alpha * result0;
|
||||
C[ci+1*ldc+0] += alpha * result1;
|
||||
m_top+=1;
|
||||
}
|
||||
|
||||
n_top += 2;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// -- tails for N=1
|
||||
|
||||
if( N & 1 ) {
|
||||
gvl = __riscv_vsetvl_e64m1(4);
|
||||
m_top = 0;
|
||||
|
||||
for (BLASLONG i=0; i<M/8; i+=1) {
|
||||
BLASLONG ai=m_top*K;
|
||||
BLASLONG bi=n_top*K;
|
||||
double B0 = B[bi+0];
|
||||
bi += 1;
|
||||
|
||||
vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
|
||||
vfloat64m1_t A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
|
||||
ai += 8;
|
||||
|
||||
vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
|
||||
vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A1, B0, gvl);
|
||||
|
||||
for(BLASLONG k=1; k<K; k++) {
|
||||
B0 = B[bi+0];
|
||||
bi += 1;
|
||||
|
||||
A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
|
||||
A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
|
||||
ai += 8;
|
||||
|
||||
result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
|
||||
result1 = __riscv_vfmacc_vf_f64m1( result1, B0, A1, gvl);
|
||||
}
|
||||
|
||||
|
||||
BLASLONG ci=n_top*ldc+m_top;
|
||||
|
||||
vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
|
||||
vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl);
|
||||
c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl );
|
||||
c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl );
|
||||
|
||||
ci=n_top*ldc+m_top;
|
||||
|
||||
__riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl;
|
||||
__riscv_vse64_v_f64m1( &C[ci], c1, gvl);
|
||||
m_top += 8;
|
||||
}
|
||||
|
||||
|
||||
if( M & 4 ) {
|
||||
gvl = __riscv_vsetvl_e64m1(4);
|
||||
|
||||
BLASLONG ai=m_top*K;
|
||||
BLASLONG bi=n_top*K;
|
||||
double B0 = B[bi+0];
|
||||
bi += 1;
|
||||
|
||||
vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
|
||||
ai += 4;
|
||||
|
||||
vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
|
||||
|
||||
for(BLASLONG k=1; k<K; k++) {
|
||||
B0 = B[bi+0];
|
||||
bi += 1;
|
||||
|
||||
A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
|
||||
ai += 4;
|
||||
|
||||
result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
|
||||
}
|
||||
|
||||
|
||||
BLASLONG ci=n_top*ldc+m_top;
|
||||
|
||||
vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl);
|
||||
c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl );
|
||||
|
||||
ci=n_top*ldc+m_top;
|
||||
|
||||
__riscv_vse64_v_f64m1( &C[ci], c0, gvl);
|
||||
m_top += 4;
|
||||
}
|
||||
|
||||
|
||||
if( M & 2 ) {
|
||||
double result0 = 0;
|
||||
double result1 = 0;
|
||||
BLASLONG ai=m_top*K;
|
||||
BLASLONG bi=n_top*K;
|
||||
|
||||
for(BLASLONG k=0; k<K; k++) {
|
||||
result0+=A[ai+0]*B[bi+0];
|
||||
result1+=A[ai+1]*B[bi+0];
|
||||
ai+=2;
|
||||
bi+=1;
|
||||
}
|
||||
|
||||
BLASLONG ci=n_top*ldc+m_top;
|
||||
C[ci+0*ldc+0] += alpha * result0;
|
||||
C[ci+0*ldc+1] += alpha * result1;
|
||||
m_top+=2;
|
||||
}
|
||||
|
||||
|
||||
if( M & 1 ) {
|
||||
double result0 = 0;
|
||||
BLASLONG ai=m_top*K;
|
||||
BLASLONG bi=n_top*K;
|
||||
|
||||
for(BLASLONG k=0; k<K; k++) {
|
||||
result0+=A[ai+0]*B[bi+0];
|
||||
ai+=1;
|
||||
bi+=1;
|
||||
}
|
||||
|
||||
BLASLONG ci=n_top*ldc+m_top;
|
||||
C[ci+0*ldc+0] += alpha * result0;
|
||||
m_top+=1;
|
||||
}
|
||||
|
||||
n_top += 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
126
kernel/riscv64/dot_rvv.c
Normal file
126
kernel/riscv64/dot_rvv.c
Normal file
@@ -0,0 +1,126 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(DSDOT)
|
||||
double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
#else
|
||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
#endif
|
||||
{
|
||||
double dot = 0.0;
|
||||
|
||||
if ( n <= 0 ) return(dot);
|
||||
|
||||
size_t vlmax = __riscv_vsetvlmax_e64m8();
|
||||
vfloat64m8_t vr = __riscv_vfmv_v_f_f64m8(0, vlmax);
|
||||
|
||||
if(inc_x == 1 && inc_y == 1) {
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl, y += vl) {
|
||||
vl = __riscv_vsetvl_e64m8(n);
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
vfloat32m4_t vx = __riscv_vle32_v_f32m4(x, vl);
|
||||
vfloat32m4_t vy = __riscv_vle32_v_f32m4(y, vl);
|
||||
|
||||
vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl);
|
||||
#else
|
||||
vfloat64m8_t vx = __riscv_vle64_v_f64m8(x, vl);
|
||||
vfloat64m8_t vy = __riscv_vle64_v_f64m8(y, vl);
|
||||
|
||||
vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl);
|
||||
#endif
|
||||
}
|
||||
|
||||
} else if (1 == inc_x) {
|
||||
|
||||
BLASLONG stride_y = inc_y * sizeof(FLOAT);
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) {
|
||||
vl = __riscv_vsetvl_e64m8(n);
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
vfloat32m4_t vx = __riscv_vle32_v_f32m4(x, vl);
|
||||
vfloat32m4_t vy = __riscv_vlse32_v_f32m4(y, stride_y, vl);
|
||||
|
||||
vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl);
|
||||
#else
|
||||
vfloat64m8_t vx = __riscv_vle64_v_f64m8(x, vl);
|
||||
vfloat64m8_t vy = __riscv_vlse64_v_f64m8(y, stride_y, vl);
|
||||
|
||||
vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl);
|
||||
#endif
|
||||
}
|
||||
} else if (1 == inc_y) {
|
||||
|
||||
BLASLONG stride_x = inc_x * sizeof(FLOAT);
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) {
|
||||
vl = __riscv_vsetvl_e64m8(n);
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
vfloat32m4_t vx = __riscv_vlse32_v_f32m4(x, stride_x, vl);
|
||||
vfloat32m4_t vy = __riscv_vle32_v_f32m4(y, vl);
|
||||
|
||||
vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl);
|
||||
#else
|
||||
vfloat64m8_t vx = __riscv_vlse64_v_f64m8(x, stride_x, vl);
|
||||
vfloat64m8_t vy = __riscv_vle64_v_f64m8(y, vl);
|
||||
|
||||
vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl);
|
||||
#endif
|
||||
}
|
||||
} else {
|
||||
|
||||
BLASLONG stride_x = inc_x * sizeof(FLOAT);
|
||||
BLASLONG stride_y = inc_y * sizeof(FLOAT);
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) {
|
||||
vl = __riscv_vsetvl_e64m8(n);
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
vfloat32m4_t vx = __riscv_vlse32_v_f32m4(x, stride_x, vl);
|
||||
vfloat32m4_t vy = __riscv_vlse32_v_f32m4(y, stride_y, vl);
|
||||
|
||||
vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl);
|
||||
#else
|
||||
vfloat64m8_t vx = __riscv_vlse64_v_f64m8(x, stride_x, vl);
|
||||
vfloat64m8_t vy = __riscv_vlse64_v_f64m8(y, stride_y, vl);
|
||||
|
||||
vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
vfloat64m1_t vec_zero = __riscv_vfmv_v_f_f64m1(0, vlmax);
|
||||
vfloat64m1_t vec_sum = __riscv_vfredusum_vs_f64m8_f64m1(vr, vec_zero, vlmax);
|
||||
dot = __riscv_vfmv_f_s_f64m1_f64(vec_sum);
|
||||
|
||||
return(dot);
|
||||
}
|
||||
@@ -27,31 +27,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#include "common.h"
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m4(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m1()
|
||||
#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n)
|
||||
#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)()
|
||||
#define FLOAT_V_T vfloat32m4_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
|
||||
#define VLEV_FLOAT vle32_v_f32m4
|
||||
#define VLSEV_FLOAT vlse32_v_f32m4
|
||||
#define VFREDSUM_FLOAT vfredosum_vs_f32m4_f32m1
|
||||
#define VFMACCVV_FLOAT vfmacc_vv_f32m4
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f32m4
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
|
||||
#define VFDOTVV_FLOAT vfdot_vv_f32m4
|
||||
#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4)
|
||||
#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4)
|
||||
#ifdef RISCV_0p10_INTRINSICS
|
||||
#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f32m4_f32m1(v_res, va, vb, gvl)
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m4(n)
|
||||
#define VSETVL_MAX vsetvlmax_e64m1()
|
||||
#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1)
|
||||
#endif
|
||||
#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4)
|
||||
#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4)
|
||||
#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1)
|
||||
#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f32m4)
|
||||
#else
|
||||
#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
|
||||
#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)()
|
||||
#define FLOAT_V_T vfloat64m4_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
|
||||
#define VLEV_FLOAT vle64_v_f64m4
|
||||
#define VLSEV_FLOAT vlse64_v_f64m4
|
||||
#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
|
||||
#define VFMACCVV_FLOAT vfmacc_vv_f64m4
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f64m4
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
|
||||
#define VFDOTVV_FLOAT vfdot_vv_f64m4
|
||||
#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4)
|
||||
#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4)
|
||||
#ifdef RISCV_0p10_INTRINSICS
|
||||
#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f64m4_f64m1(v_res, va, vb, gvl)
|
||||
#else
|
||||
#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1)
|
||||
#endif
|
||||
#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4)
|
||||
#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4)
|
||||
#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1)
|
||||
#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f64m4)
|
||||
#endif
|
||||
|
||||
#if defined(DSDOT)
|
||||
@@ -82,8 +88,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
j += gvl;
|
||||
}
|
||||
if(j > 0){
|
||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
|
||||
dot += (double)VFMVFS_FLOAT(v_res);
|
||||
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
|
||||
dot += (double)EXTRACT_FLOAT(v_res);
|
||||
}
|
||||
//tail
|
||||
if(j < n){
|
||||
@@ -93,13 +99,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl);
|
||||
//vr = VFDOTVV_FLOAT(vx, vy, gvl);
|
||||
vr = VFMACCVV_FLOAT(vz, vx, vy, gvl);
|
||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
|
||||
dot += (double)VFMVFS_FLOAT(v_res);
|
||||
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
|
||||
dot += (double)EXTRACT_FLOAT(v_res);
|
||||
}
|
||||
}else if(inc_y == 1){
|
||||
gvl = VSETVL(n);
|
||||
vr = VFMVVF_FLOAT(0, gvl);
|
||||
int stride_x = inc_x * sizeof(FLOAT);
|
||||
BLASLONG stride_x = inc_x * sizeof(FLOAT);
|
||||
for(i=0,j=0; i<n/gvl; i++){
|
||||
vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
|
||||
vy = VLEV_FLOAT(&y[j], gvl);
|
||||
@@ -107,9 +113,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
j += gvl;
|
||||
}
|
||||
if(j > 0){
|
||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
|
||||
dot += (double)VFMVFS_FLOAT(v_res);
|
||||
|
||||
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
|
||||
dot += (double)EXTRACT_FLOAT(v_res);
|
||||
}
|
||||
//tail
|
||||
if(j < n){
|
||||
@@ -119,14 +124,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl);
|
||||
//vr = VFDOTVV_FLOAT(vx, vy, gvl);
|
||||
vr = VFMACCVV_FLOAT(vz, vx, vy, gvl);
|
||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
|
||||
dot += (double)VFMVFS_FLOAT(v_res);
|
||||
|
||||
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
|
||||
dot += (double)EXTRACT_FLOAT(v_res);
|
||||
}
|
||||
}else if(inc_x == 1){
|
||||
gvl = VSETVL(n);
|
||||
vr = VFMVVF_FLOAT(0, gvl);
|
||||
int stride_y = inc_y * sizeof(FLOAT);
|
||||
BLASLONG stride_y = inc_y * sizeof(FLOAT);
|
||||
for(i=0,j=0; i<n/gvl; i++){
|
||||
vx = VLEV_FLOAT(&x[j], gvl);
|
||||
vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl);
|
||||
@@ -134,9 +138,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
j += gvl;
|
||||
}
|
||||
if(j > 0){
|
||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
|
||||
dot += (double)VFMVFS_FLOAT(v_res);
|
||||
|
||||
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
|
||||
dot += (double)EXTRACT_FLOAT(v_res);
|
||||
}
|
||||
//tail
|
||||
if(j < n){
|
||||
@@ -146,15 +149,14 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl);
|
||||
//vr = VFDOTVV_FLOAT(vx, vy, gvl);
|
||||
vr = VFMACCVV_FLOAT(vz, vx, vy, gvl);
|
||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
|
||||
dot += (double)VFMVFS_FLOAT(v_res);
|
||||
|
||||
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
|
||||
dot += (double)EXTRACT_FLOAT(v_res);
|
||||
}
|
||||
}else{
|
||||
gvl = VSETVL(n);
|
||||
vr = VFMVVF_FLOAT(0, gvl);
|
||||
int stride_x = inc_x * sizeof(FLOAT);
|
||||
int stride_y = inc_y * sizeof(FLOAT);
|
||||
BLASLONG stride_x = inc_x * sizeof(FLOAT);
|
||||
BLASLONG stride_y = inc_y * sizeof(FLOAT);
|
||||
for(i=0,j=0; i<n/gvl; i++){
|
||||
vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
|
||||
vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl);
|
||||
@@ -162,9 +164,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
j += gvl;
|
||||
}
|
||||
if(j > 0){
|
||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
|
||||
dot += (double)VFMVFS_FLOAT(v_res);
|
||||
|
||||
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
|
||||
dot += (double)EXTRACT_FLOAT(v_res);
|
||||
}
|
||||
//tail
|
||||
if(j < n){
|
||||
@@ -174,9 +175,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl);
|
||||
//vr = VFDOTVV_FLOAT(vx, vy, gvl);
|
||||
vr = VFMACCVV_FLOAT(vz, vx, vy, gvl);
|
||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
|
||||
dot += (double)VFMVFS_FLOAT(v_res);
|
||||
|
||||
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
|
||||
dot += (double)EXTRACT_FLOAT(v_res);
|
||||
}
|
||||
}
|
||||
return(dot);
|
||||
|
||||
152
kernel/riscv64/dsdot_vector.c
Normal file
152
kernel/riscv64/dsdot_vector.c
Normal file
@@ -0,0 +1,152 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Dot product of two vectors of 32-bit floats, accumulated and returned in
 * double precision (inputs are loaded with vle32/vlse32 and widened by
 * vfwmacc into an f64 accumulator).
 *
 *   n             number of elements; n < 1 returns 0.0
 *   x, y          input vectors
 *   inc_x, inc_y  element increments of x and y
 *
 * Each of the four stride cases processes n/gvl full chunks of gvl elements,
 * reduces the accumulator once, then handles the remaining n-j elements as a
 * separate shorter chunk with its own reduction.
 *
 * Byte strides are kept in BLASLONG rather than int so that a large
 * increment multiplied by sizeof(FLOAT) is not truncated before it reaches
 * the strided load.
 */
double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
    BLASLONG i=0, j=0;
    double dot = 0.0 ;

    if ( n < 1 ) return(dot);
    vfloat64m4_t vr;
    vfloat32m2_t vx, vy;
    unsigned int gvl = 0;
    vfloat64m1_t v_res, v_z0;
    gvl = vsetvlmax_e64m1();
    v_res = vfmv_v_f_f64m1(0, gvl);
    /* v_z0 stays all-zero and is passed to every reduction below. */
    v_z0 = vfmv_v_f_f64m1(0, gvl);

    if(inc_x == 1 && inc_y == 1){
        /* Both vectors contiguous. */
        gvl = vsetvl_e64m4(n);
        vr = vfmv_v_f_f64m4(0, gvl);
        for(i=0,j=0; i<n/gvl; i++){
            vx = vle32_v_f32m2(&x[j], gvl);
            vy = vle32_v_f32m2(&y[j], gvl);
            vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl);
            j += gvl;
        }
        if(j > 0){
            v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
            dot += (double)vfmv_f_s_f64m1_f64(v_res);
        }
        //tail
        if(j < n){
            gvl = vsetvl_e64m4(n-j);
            vx = vle32_v_f32m2(&x[j], gvl);
            vy = vle32_v_f32m2(&y[j], gvl);
            vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl);
            vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl);
            v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
            dot += (double)vfmv_f_s_f64m1_f64(v_res);
        }
    }else if(inc_y == 1){
        /* x strided, y contiguous. */
        gvl = vsetvl_e64m4(n);
        vr = vfmv_v_f_f64m4(0, gvl);
        BLASLONG stride_x = inc_x * sizeof(FLOAT);
        for(i=0,j=0; i<n/gvl; i++){
            vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl);
            vy = vle32_v_f32m2(&y[j], gvl);
            vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl);
            j += gvl;
        }
        if(j > 0){
            v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
            dot += (double)vfmv_f_s_f64m1_f64(v_res);
        }
        //tail
        if(j < n){
            gvl = vsetvl_e64m4(n-j);
            vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl);
            vy = vle32_v_f32m2(&y[j], gvl);
            vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl);
            vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl);
            v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
            dot += (double)vfmv_f_s_f64m1_f64(v_res);
        }
    }else if(inc_x == 1){
        /* x contiguous, y strided. */
        gvl = vsetvl_e64m4(n);
        vr = vfmv_v_f_f64m4(0, gvl);
        BLASLONG stride_y = inc_y * sizeof(FLOAT);
        for(i=0,j=0; i<n/gvl; i++){
            vx = vle32_v_f32m2(&x[j], gvl);
            vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl);
            vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl);
            j += gvl;
        }
        if(j > 0){
            v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
            dot += (double)vfmv_f_s_f64m1_f64(v_res);
        }
        //tail
        if(j < n){
            gvl = vsetvl_e64m4(n-j);
            vx = vle32_v_f32m2(&x[j], gvl);
            vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl);
            vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl);
            vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl);
            v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
            dot += (double)vfmv_f_s_f64m1_f64(v_res);
        }
    }else{
        /* Both vectors strided. */
        gvl = vsetvl_e64m4(n);
        vr = vfmv_v_f_f64m4(0, gvl);
        BLASLONG stride_x = inc_x * sizeof(FLOAT);
        BLASLONG stride_y = inc_y * sizeof(FLOAT);
        for(i=0,j=0; i<n/gvl; i++){
            vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl);
            vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl);
            vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl);
            j += gvl;
        }
        if(j > 0){
            v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
            dot += (double)vfmv_f_s_f64m1_f64(v_res);
        }
        //tail
        if(j < n){
            gvl = vsetvl_e64m4(n-j);
            vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl);
            vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl);
            vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl);
            vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl);
            v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
            dot += (double)vfmv_f_s_f64m1_f64(v_res);
        }
    }
    return(dot);
}
|
||||
660
kernel/riscv64/dtrmm_kernel_8x4_zvl128b.c
Normal file
660
kernel/riscv64/dtrmm_kernel_8x4_zvl128b.c
Normal file
@@ -0,0 +1,660 @@
|
||||
/*
|
||||
|
||||
AUTOGENERATED KERNEL
|
||||
Script: ./kernel/riscv64/generate_kernel.py
|
||||
Settings:
|
||||
LMUL=4
|
||||
M=8
|
||||
M_tail_scalar_from=2
|
||||
N=4
|
||||
__riscv_='__riscv_'
|
||||
complex=False
|
||||
conjugate=False
|
||||
cpu='zvl128b'
|
||||
force_acc_double=False
|
||||
index_type='BLASLONG'
|
||||
op='trmm'
|
||||
param_precision='double'
|
||||
reg_width_bits=128
|
||||
tail_policy=''
|
||||
trace=False
|
||||
|
||||
Derived:
|
||||
ELEN_ACC=64
|
||||
ELEN_PARAM=64
|
||||
LMUL_ACC=4
|
||||
VFMACC='__riscv_vfmacc_vf_f64m4'
|
||||
VFMUL='__riscv_vfmul_vf_f64m4'
|
||||
VLEV='__riscv_vle64_v_f64m4'
|
||||
VLSEV='__riscv_vlse64_v_f64m4'
|
||||
VMACC_TO_ACC='__riscv_vfmacc_vf_f64m4'
|
||||
VMUL_TO_ACC='__riscv_vfmul_vf_f64m4'
|
||||
VSETVL='__riscv_vsetvl_e64m4'
|
||||
VSEV='__riscv_vse64_v_f64m4'
|
||||
VSSEV='__riscv_vsse64_v_f64m4'
|
||||
acc_vector_t='vfloat64m4_t'
|
||||
output='dtrmm_kernel_8x4_zvl128b.c'
|
||||
param_scalar_t='double'
|
||||
param_vector_t='vfloat64m4_t'
|
||||
|
||||
*/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(LEFT) != defined(TRANSA)
|
||||
#define BACKWARDS
|
||||
#endif
|
||||
|
||||
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc, BLASLONG offset)
|
||||
|
||||
{
|
||||
BLASLONG gvl = 0;
|
||||
BLASLONG m_top = 0;
|
||||
BLASLONG n_top = 0;
|
||||
|
||||
// -- MAIN PASS
|
||||
|
||||
for (BLASLONG j = 0; j < N / 4; j += 1) {
|
||||
m_top = 0;
|
||||
BLASLONG gvl = __riscv_vsetvl_e64m4(8);
|
||||
|
||||
for (BLASLONG i = 0; i < M / 8; i += 1) {
|
||||
BLASLONG ai = m_top * K;
|
||||
BLASLONG bi = n_top * K;
|
||||
BLASLONG pass_K = K;
|
||||
#ifdef LEFT
|
||||
BLASLONG off = offset + m_top;
|
||||
#else
|
||||
BLASLONG off = -offset + n_top;
|
||||
#endif
|
||||
#ifdef BACKWARDS
|
||||
ai += off * 8;
|
||||
bi += off * 4;
|
||||
pass_K -= off;
|
||||
#else
|
||||
#ifdef LEFT
|
||||
pass_K = off + 8;
|
||||
#else
|
||||
pass_K = off + 4;
|
||||
#endif
|
||||
#endif
|
||||
double B0 = B[bi + 0];
|
||||
double B1 = B[bi + 1];
|
||||
double B2 = B[bi + 2];
|
||||
double B3 = B[bi + 3];
|
||||
bi += 4;
|
||||
|
||||
vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
|
||||
ai += 8;
|
||||
|
||||
vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
|
||||
vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl);
|
||||
vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl);
|
||||
vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl);
|
||||
|
||||
for (BLASLONG k = 1; k < pass_K; k++) {
|
||||
B0 = B[bi + 0];
|
||||
B1 = B[bi + 1];
|
||||
B2 = B[bi + 2];
|
||||
B3 = B[bi + 3];
|
||||
bi += 4;
|
||||
|
||||
A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
|
||||
ai += 8;
|
||||
|
||||
result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
|
||||
result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl);
|
||||
result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl);
|
||||
result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl);
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
|
||||
vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl);
|
||||
vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl);
|
||||
vfloat64m4_t c2 = __riscv_vfmul_vf_f64m4(result2, alpha, gvl);
|
||||
vfloat64m4_t c3 = __riscv_vfmul_vf_f64m4(result3, alpha, gvl);
|
||||
__riscv_vse64_v_f64m4(&C[ci], c0, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse64_v_f64m4(&C[ci], c1, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse64_v_f64m4(&C[ci], c2, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse64_v_f64m4(&C[ci], c3, gvl);
|
||||
m_top += 8;
|
||||
}
|
||||
|
||||
// -- tails for main pass
|
||||
|
||||
if (M & 4) {
|
||||
gvl = __riscv_vsetvl_e64m4(4);
|
||||
|
||||
BLASLONG ai = m_top * K;
|
||||
BLASLONG bi = n_top * K;
|
||||
BLASLONG pass_K = K;
|
||||
#ifdef LEFT
|
||||
BLASLONG off = offset + m_top;
|
||||
#else
|
||||
BLASLONG off = -offset + n_top;
|
||||
#endif
|
||||
#ifdef BACKWARDS
|
||||
ai += off * 4;
|
||||
bi += off * 4;
|
||||
pass_K -= off;
|
||||
#else
|
||||
#ifdef LEFT
|
||||
pass_K = off + 4;
|
||||
#else
|
||||
pass_K = off + 4;
|
||||
#endif
|
||||
#endif
|
||||
double B0 = B[bi + 0];
|
||||
double B1 = B[bi + 1];
|
||||
double B2 = B[bi + 2];
|
||||
double B3 = B[bi + 3];
|
||||
bi += 4;
|
||||
|
||||
vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
|
||||
ai += 4;
|
||||
|
||||
vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
|
||||
vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl);
|
||||
vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl);
|
||||
vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl);
|
||||
|
||||
for (BLASLONG k = 1; k < pass_K; k++) {
|
||||
B0 = B[bi + 0];
|
||||
B1 = B[bi + 1];
|
||||
B2 = B[bi + 2];
|
||||
B3 = B[bi + 3];
|
||||
bi += 4;
|
||||
|
||||
A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
|
||||
ai += 4;
|
||||
|
||||
result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
|
||||
result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl);
|
||||
result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl);
|
||||
result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl);
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
|
||||
vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl);
|
||||
vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl);
|
||||
vfloat64m4_t c2 = __riscv_vfmul_vf_f64m4(result2, alpha, gvl);
|
||||
vfloat64m4_t c3 = __riscv_vfmul_vf_f64m4(result3, alpha, gvl);
|
||||
__riscv_vse64_v_f64m4(&C[ci], c0, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse64_v_f64m4(&C[ci], c1, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse64_v_f64m4(&C[ci], c2, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse64_v_f64m4(&C[ci], c3, gvl);
|
||||
m_top += 4;
|
||||
}
|
||||
|
||||
if (M & 2) {
|
||||
double result0 = 0;
|
||||
double result1 = 0;
|
||||
double result2 = 0;
|
||||
double result3 = 0;
|
||||
double result4 = 0;
|
||||
double result5 = 0;
|
||||
double result6 = 0;
|
||||
double result7 = 0;
|
||||
BLASLONG ai = m_top * K;
|
||||
BLASLONG bi = n_top * K;
|
||||
BLASLONG pass_K = K;
|
||||
#ifdef LEFT
|
||||
BLASLONG off = offset + m_top;
|
||||
#else
|
||||
BLASLONG off = -offset + n_top;
|
||||
#endif
|
||||
#ifdef BACKWARDS
|
||||
ai += off * 2;
|
||||
bi += off * 4;
|
||||
pass_K -= off;
|
||||
#else
|
||||
#ifdef LEFT
|
||||
pass_K = off + 2;
|
||||
#else
|
||||
pass_K = off + 4;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
for (BLASLONG k = 0; k < pass_K; k++) {
|
||||
result0 += A[ai + 0] * B[bi + 0];
|
||||
result1 += A[ai + 1] * B[bi + 0];
|
||||
result2 += A[ai + 0] * B[bi + 1];
|
||||
result3 += A[ai + 1] * B[bi + 1];
|
||||
result4 += A[ai + 0] * B[bi + 2];
|
||||
result5 += A[ai + 1] * B[bi + 2];
|
||||
result6 += A[ai + 0] * B[bi + 3];
|
||||
result7 += A[ai + 1] * B[bi + 3];
|
||||
ai += 2;
|
||||
bi += 4;
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
C[ci + 0 * ldc + 0] = alpha * result0;
|
||||
C[ci + 0 * ldc + 1] = alpha * result1;
|
||||
C[ci + 1 * ldc + 0] = alpha * result2;
|
||||
C[ci + 1 * ldc + 1] = alpha * result3;
|
||||
C[ci + 2 * ldc + 0] = alpha * result4;
|
||||
C[ci + 2 * ldc + 1] = alpha * result5;
|
||||
C[ci + 3 * ldc + 0] = alpha * result6;
|
||||
C[ci + 3 * ldc + 1] = alpha * result7;
|
||||
m_top += 2;
|
||||
}
|
||||
|
||||
if (M & 1) {
|
||||
double result0 = 0;
|
||||
double result1 = 0;
|
||||
double result2 = 0;
|
||||
double result3 = 0;
|
||||
BLASLONG ai = m_top * K;
|
||||
BLASLONG bi = n_top * K;
|
||||
BLASLONG pass_K = K;
|
||||
#ifdef LEFT
|
||||
BLASLONG off = offset + m_top;
|
||||
#else
|
||||
BLASLONG off = -offset + n_top;
|
||||
#endif
|
||||
#ifdef BACKWARDS
|
||||
ai += off * 1;
|
||||
bi += off * 4;
|
||||
pass_K -= off;
|
||||
#else
|
||||
#ifdef LEFT
|
||||
pass_K = off + 1;
|
||||
#else
|
||||
pass_K = off + 4;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
for (BLASLONG k = 0; k < pass_K; k++) {
|
||||
result0 += A[ai + 0] * B[bi + 0];
|
||||
result1 += A[ai + 0] * B[bi + 1];
|
||||
result2 += A[ai + 0] * B[bi + 2];
|
||||
result3 += A[ai + 0] * B[bi + 3];
|
||||
ai += 1;
|
||||
bi += 4;
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
C[ci + 0 * ldc + 0] = alpha * result0;
|
||||
C[ci + 1 * ldc + 0] = alpha * result1;
|
||||
C[ci + 2 * ldc + 0] = alpha * result2;
|
||||
C[ci + 3 * ldc + 0] = alpha * result3;
|
||||
m_top += 1;
|
||||
}
|
||||
|
||||
n_top += 4;
|
||||
}
|
||||
|
||||
// -- tails for N=2
|
||||
|
||||
if (N & 2) {
|
||||
gvl = __riscv_vsetvl_e64m4(8);
|
||||
m_top = 0;
|
||||
|
||||
for (BLASLONG i = 0; i < M / 8; i += 1) {
|
||||
BLASLONG ai = m_top * K;
|
||||
BLASLONG bi = n_top * K;
|
||||
BLASLONG pass_K = K;
|
||||
#ifdef LEFT
|
||||
BLASLONG off = offset + m_top;
|
||||
#else
|
||||
BLASLONG off = -offset + n_top;
|
||||
#endif
|
||||
#ifdef BACKWARDS
|
||||
ai += off * 8;
|
||||
bi += off * 2;
|
||||
pass_K -= off;
|
||||
#else
|
||||
#ifdef LEFT
|
||||
pass_K = off + 8;
|
||||
#else
|
||||
pass_K = off + 2;
|
||||
#endif
|
||||
#endif
|
||||
double B0 = B[bi + 0];
|
||||
double B1 = B[bi + 1];
|
||||
bi += 2;
|
||||
|
||||
vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
|
||||
ai += 8;
|
||||
|
||||
vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
|
||||
vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl);
|
||||
|
||||
for (BLASLONG k = 1; k < pass_K; k++) {
|
||||
B0 = B[bi + 0];
|
||||
B1 = B[bi + 1];
|
||||
bi += 2;
|
||||
|
||||
A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
|
||||
ai += 8;
|
||||
|
||||
result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
|
||||
result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl);
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
|
||||
vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl);
|
||||
vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl);
|
||||
__riscv_vse64_v_f64m4(&C[ci], c0, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse64_v_f64m4(&C[ci], c1, gvl);
|
||||
m_top += 8;
|
||||
}
|
||||
|
||||
if (M & 4) {
|
||||
gvl = __riscv_vsetvl_e64m4(4);
|
||||
|
||||
BLASLONG ai = m_top * K;
|
||||
BLASLONG bi = n_top * K;
|
||||
BLASLONG pass_K = K;
|
||||
#ifdef LEFT
|
||||
BLASLONG off = offset + m_top;
|
||||
#else
|
||||
BLASLONG off = -offset + n_top;
|
||||
#endif
|
||||
#ifdef BACKWARDS
|
||||
ai += off * 4;
|
||||
bi += off * 2;
|
||||
pass_K -= off;
|
||||
#else
|
||||
#ifdef LEFT
|
||||
pass_K = off + 4;
|
||||
#else
|
||||
pass_K = off + 2;
|
||||
#endif
|
||||
#endif
|
||||
double B0 = B[bi + 0];
|
||||
double B1 = B[bi + 1];
|
||||
bi += 2;
|
||||
|
||||
vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
|
||||
ai += 4;
|
||||
|
||||
vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
|
||||
vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl);
|
||||
|
||||
for (BLASLONG k = 1; k < pass_K; k++) {
|
||||
B0 = B[bi + 0];
|
||||
B1 = B[bi + 1];
|
||||
bi += 2;
|
||||
|
||||
A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
|
||||
ai += 4;
|
||||
|
||||
result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
|
||||
result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl);
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
|
||||
vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl);
|
||||
vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl);
|
||||
__riscv_vse64_v_f64m4(&C[ci], c0, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse64_v_f64m4(&C[ci], c1, gvl);
|
||||
m_top += 4;
|
||||
}
|
||||
|
||||
if (M & 2) {
|
||||
double result0 = 0;
|
||||
double result1 = 0;
|
||||
double result2 = 0;
|
||||
double result3 = 0;
|
||||
BLASLONG ai = m_top * K;
|
||||
BLASLONG bi = n_top * K;
|
||||
BLASLONG pass_K = K;
|
||||
#ifdef LEFT
|
||||
BLASLONG off = offset + m_top;
|
||||
#else
|
||||
BLASLONG off = -offset + n_top;
|
||||
#endif
|
||||
#ifdef BACKWARDS
|
||||
ai += off * 2;
|
||||
bi += off * 2;
|
||||
pass_K -= off;
|
||||
#else
|
||||
#ifdef LEFT
|
||||
pass_K = off + 2;
|
||||
#else
|
||||
pass_K = off + 2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
for (BLASLONG k = 0; k < pass_K; k++) {
|
||||
result0 += A[ai + 0] * B[bi + 0];
|
||||
result1 += A[ai + 1] * B[bi + 0];
|
||||
result2 += A[ai + 0] * B[bi + 1];
|
||||
result3 += A[ai + 1] * B[bi + 1];
|
||||
ai += 2;
|
||||
bi += 2;
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
C[ci + 0 * ldc + 0] = alpha * result0;
|
||||
C[ci + 0 * ldc + 1] = alpha * result1;
|
||||
C[ci + 1 * ldc + 0] = alpha * result2;
|
||||
C[ci + 1 * ldc + 1] = alpha * result3;
|
||||
m_top += 2;
|
||||
}
|
||||
|
||||
if (M & 1) {
|
||||
double result0 = 0;
|
||||
double result1 = 0;
|
||||
BLASLONG ai = m_top * K;
|
||||
BLASLONG bi = n_top * K;
|
||||
BLASLONG pass_K = K;
|
||||
#ifdef LEFT
|
||||
BLASLONG off = offset + m_top;
|
||||
#else
|
||||
BLASLONG off = -offset + n_top;
|
||||
#endif
|
||||
#ifdef BACKWARDS
|
||||
ai += off * 1;
|
||||
bi += off * 2;
|
||||
pass_K -= off;
|
||||
#else
|
||||
#ifdef LEFT
|
||||
pass_K = off + 1;
|
||||
#else
|
||||
pass_K = off + 2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
for (BLASLONG k = 0; k < pass_K; k++) {
|
||||
result0 += A[ai + 0] * B[bi + 0];
|
||||
result1 += A[ai + 0] * B[bi + 1];
|
||||
ai += 1;
|
||||
bi += 2;
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
C[ci + 0 * ldc + 0] = alpha * result0;
|
||||
C[ci + 1 * ldc + 0] = alpha * result1;
|
||||
m_top += 1;
|
||||
}
|
||||
|
||||
n_top += 2;
|
||||
}
|
||||
|
||||
// -- tails for N=1
|
||||
|
||||
if (N & 1) {
|
||||
gvl = __riscv_vsetvl_e64m4(8);
|
||||
m_top = 0;
|
||||
|
||||
for (BLASLONG i = 0; i < M / 8; i += 1) {
|
||||
BLASLONG ai = m_top * K;
|
||||
BLASLONG bi = n_top * K;
|
||||
BLASLONG pass_K = K;
|
||||
#ifdef LEFT
|
||||
BLASLONG off = offset + m_top;
|
||||
#else
|
||||
BLASLONG off = -offset + n_top;
|
||||
#endif
|
||||
#ifdef BACKWARDS
|
||||
ai += off * 8;
|
||||
bi += off * 1;
|
||||
pass_K -= off;
|
||||
#else
|
||||
#ifdef LEFT
|
||||
pass_K = off + 8;
|
||||
#else
|
||||
pass_K = off + 1;
|
||||
#endif
|
||||
#endif
|
||||
double B0 = B[bi + 0];
|
||||
bi += 1;
|
||||
|
||||
vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
|
||||
ai += 8;
|
||||
|
||||
vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
|
||||
|
||||
for (BLASLONG k = 1; k < pass_K; k++) {
|
||||
B0 = B[bi + 0];
|
||||
bi += 1;
|
||||
|
||||
A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
|
||||
ai += 8;
|
||||
|
||||
result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
|
||||
vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl);
|
||||
__riscv_vse64_v_f64m4(&C[ci], c0, gvl);
|
||||
m_top += 8;
|
||||
}
|
||||
|
||||
if (M & 4) {
|
||||
gvl = __riscv_vsetvl_e64m4(4);
|
||||
|
||||
BLASLONG ai = m_top * K;
|
||||
BLASLONG bi = n_top * K;
|
||||
BLASLONG pass_K = K;
|
||||
#ifdef LEFT
|
||||
BLASLONG off = offset + m_top;
|
||||
#else
|
||||
BLASLONG off = -offset + n_top;
|
||||
#endif
|
||||
#ifdef BACKWARDS
|
||||
ai += off * 4;
|
||||
bi += off * 1;
|
||||
pass_K -= off;
|
||||
#else
|
||||
#ifdef LEFT
|
||||
pass_K = off + 4;
|
||||
#else
|
||||
pass_K = off + 1;
|
||||
#endif
|
||||
#endif
|
||||
double B0 = B[bi + 0];
|
||||
bi += 1;
|
||||
|
||||
vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
|
||||
ai += 4;
|
||||
|
||||
vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
|
||||
|
||||
for (BLASLONG k = 1; k < pass_K; k++) {
|
||||
B0 = B[bi + 0];
|
||||
bi += 1;
|
||||
|
||||
A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
|
||||
ai += 4;
|
||||
|
||||
result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
|
||||
vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl);
|
||||
__riscv_vse64_v_f64m4(&C[ci], c0, gvl);
|
||||
m_top += 4;
|
||||
}
|
||||
|
||||
if (M & 2) {
|
||||
double result0 = 0;
|
||||
double result1 = 0;
|
||||
BLASLONG ai = m_top * K;
|
||||
BLASLONG bi = n_top * K;
|
||||
BLASLONG pass_K = K;
|
||||
#ifdef LEFT
|
||||
BLASLONG off = offset + m_top;
|
||||
#else
|
||||
BLASLONG off = -offset + n_top;
|
||||
#endif
|
||||
#ifdef BACKWARDS
|
||||
ai += off * 2;
|
||||
bi += off * 1;
|
||||
pass_K -= off;
|
||||
#else
|
||||
#ifdef LEFT
|
||||
pass_K = off + 2;
|
||||
#else
|
||||
pass_K = off + 1;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
for (BLASLONG k = 0; k < pass_K; k++) {
|
||||
result0 += A[ai + 0] * B[bi + 0];
|
||||
result1 += A[ai + 1] * B[bi + 0];
|
||||
ai += 2;
|
||||
bi += 1;
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
C[ci + 0 * ldc + 0] = alpha * result0;
|
||||
C[ci + 0 * ldc + 1] = alpha * result1;
|
||||
m_top += 2;
|
||||
}
|
||||
|
||||
if (M & 1) {
|
||||
double result0 = 0;
|
||||
BLASLONG ai = m_top * K;
|
||||
BLASLONG bi = n_top * K;
|
||||
BLASLONG pass_K = K;
|
||||
#ifdef LEFT
|
||||
BLASLONG off = offset + m_top;
|
||||
#else
|
||||
BLASLONG off = -offset + n_top;
|
||||
#endif
|
||||
#ifdef BACKWARDS
|
||||
ai += off * 1;
|
||||
bi += off * 1;
|
||||
pass_K -= off;
|
||||
#else
|
||||
#ifdef LEFT
|
||||
pass_K = off + 1;
|
||||
#else
|
||||
pass_K = off + 1;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
for (BLASLONG k = 0; k < pass_K; k++) {
|
||||
result0 += A[ai + 0] * B[bi + 0];
|
||||
ai += 1;
|
||||
bi += 1;
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
C[ci + 0 * ldc + 0] = alpha * result0;
|
||||
m_top += 1;
|
||||
}
|
||||
|
||||
n_top += 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
1068
kernel/riscv64/dtrmm_kernel_8x8_zvl256b.c
Normal file
1068
kernel/riscv64/dtrmm_kernel_8x8_zvl256b.c
Normal file
File diff suppressed because it is too large
Load Diff
89
kernel/riscv64/gemm_beta_rvv.c
Normal file
89
kernel/riscv64/gemm_beta_rvv.c
Normal file
@@ -0,0 +1,89 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) __riscv_vsetvl_e32m8(n)
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define VLEV_FLOAT __riscv_vle32_v_f32m8
|
||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8
|
||||
#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8
|
||||
#define VSEV_FLOAT __riscv_vse32_v_f32m8
|
||||
#else
|
||||
#define VSETVL(n) __riscv_vsetvl_e64m8(n)
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define VLEV_FLOAT __riscv_vle64_v_f64m8
|
||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8
|
||||
#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8
|
||||
#define VSEV_FLOAT __riscv_vse64_v_f64m8
|
||||
#endif
|
||||
|
||||
// Optimizes the implementation in ../generic/gemm_beta.c
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
|
||||
IFLOAT *dummy2, BLASLONG dummy3, IFLOAT *dummy4, BLASLONG dummy5,
|
||||
FLOAT *c, BLASLONG ldc)
|
||||
{
|
||||
BLASLONG chunk;
|
||||
FLOAT *c_offset;
|
||||
size_t vl;
|
||||
FLOAT_V_T vx;
|
||||
|
||||
if (beta == ZERO) {
|
||||
|
||||
vl = VSETVL(m);
|
||||
vx = VFMVVF_FLOAT(0.0, vl);
|
||||
|
||||
for( ; n > 0; n--, c += ldc) {
|
||||
c_offset = c;
|
||||
|
||||
for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl) {
|
||||
vl = VSETVL(chunk);
|
||||
|
||||
VSEV_FLOAT(c_offset, vx, vl);
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
for( ; n > 0; n--, c += ldc) {
|
||||
c_offset = c;
|
||||
|
||||
for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl) {
|
||||
vl = VSETVL(chunk);
|
||||
|
||||
vx = VLEV_FLOAT(c_offset, vl);
|
||||
vx = VFMULVF_FLOAT(vx, beta, vl);
|
||||
VSEV_FLOAT(c_offset, vx, vl);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
197
kernel/riscv64/gemm_ncopy_8_rvv.c
Normal file
197
kernel/riscv64/gemm_ncopy_8_rvv.c
Normal file
@@ -0,0 +1,197 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) __riscv_vsetvl_e32m1(n)
|
||||
#define FLOAT_V_T vfloat32m1_t
|
||||
#define FLOAT_VX2_T vfloat32m1x2_t
|
||||
#define FLOAT_VX4_T vfloat32m1x4_t
|
||||
#define FLOAT_VX8_T vfloat32m1x8_t
|
||||
#define VSET_VX2 __riscv_vset_v_f32m1_f32m1x2
|
||||
#define VSET_VX4 __riscv_vset_v_f32m1_f32m1x4
|
||||
#define VSET_VX8 __riscv_vset_v_f32m1_f32m1x8
|
||||
#define VLEV_FLOAT __riscv_vle32_v_f32m1
|
||||
#define VSEV_FLOAT __riscv_vse32_v_f32m1
|
||||
#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2
|
||||
#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4
|
||||
#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1x8
|
||||
#else
|
||||
#define VSETVL(n) __riscv_vsetvl_e64m1(n)
|
||||
#define FLOAT_V_T vfloat64m1_t
|
||||
#define FLOAT_VX2_T vfloat64m1x2_t
|
||||
#define FLOAT_VX4_T vfloat64m1x4_t
|
||||
#define FLOAT_VX8_T vfloat64m1x8_t
|
||||
#define VSET_VX2 __riscv_vset_v_f64m1_f64m1x2
|
||||
#define VSET_VX4 __riscv_vset_v_f64m1_f64m1x4
|
||||
#define VSET_VX8 __riscv_vset_v_f64m1_f64m1x8
|
||||
#define VLEV_FLOAT __riscv_vle64_v_f64m1
|
||||
#define VSEV_FLOAT __riscv_vse64_v_f64m1
|
||||
#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2
|
||||
#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4
|
||||
#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1x8
|
||||
#endif
|
||||
|
||||
// Optimizes the implementation in ../generic/gemm_ncopy_8.c
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b)
|
||||
{
|
||||
BLASLONG i, j;
|
||||
|
||||
FLOAT *a_offset;
|
||||
FLOAT *a_offset1, *a_offset2, *a_offset3, *a_offset4;
|
||||
FLOAT *a_offset5, *a_offset6, *a_offset7, *a_offset8;
|
||||
FLOAT *b_offset;
|
||||
|
||||
FLOAT_V_T v1, v2, v3, v4, v5, v6, v7, v8;
|
||||
FLOAT_VX2_T vx2;
|
||||
FLOAT_VX4_T vx4;
|
||||
FLOAT_VX8_T vx8;
|
||||
|
||||
size_t vl;
|
||||
|
||||
//fprintf(stderr, "gemm_ncopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda);
|
||||
|
||||
a_offset = a;
|
||||
b_offset = b;
|
||||
|
||||
for(j = (n >> 3); j > 0; j--) {
|
||||
a_offset1 = a_offset;
|
||||
a_offset2 = a_offset1 + lda;
|
||||
a_offset3 = a_offset2 + lda;
|
||||
a_offset4 = a_offset3 + lda;
|
||||
a_offset5 = a_offset4 + lda;
|
||||
a_offset6 = a_offset5 + lda;
|
||||
a_offset7 = a_offset6 + lda;
|
||||
a_offset8 = a_offset7 + lda;
|
||||
a_offset += 8 * lda;
|
||||
|
||||
for(i = m; i > 0; i -= vl) {
|
||||
vl = VSETVL(i);
|
||||
|
||||
v1 = VLEV_FLOAT(a_offset1, vl);
|
||||
v2 = VLEV_FLOAT(a_offset2, vl);
|
||||
v3 = VLEV_FLOAT(a_offset3, vl);
|
||||
v4 = VLEV_FLOAT(a_offset4, vl);
|
||||
v5 = VLEV_FLOAT(a_offset5, vl);
|
||||
v6 = VLEV_FLOAT(a_offset6, vl);
|
||||
v7 = VLEV_FLOAT(a_offset7, vl);
|
||||
v8 = VLEV_FLOAT(a_offset8, vl);
|
||||
|
||||
vx8 = VSET_VX8(vx8, 0, v1);
|
||||
vx8 = VSET_VX8(vx8, 1, v2);
|
||||
vx8 = VSET_VX8(vx8, 2, v3);
|
||||
vx8 = VSET_VX8(vx8, 3, v4);
|
||||
vx8 = VSET_VX8(vx8, 4, v5);
|
||||
vx8 = VSET_VX8(vx8, 5, v6);
|
||||
vx8 = VSET_VX8(vx8, 6, v7);
|
||||
vx8 = VSET_VX8(vx8, 7, v8);
|
||||
|
||||
VSSEG8_FLOAT(b_offset, vx8, vl);
|
||||
|
||||
a_offset1 += vl;
|
||||
a_offset2 += vl;
|
||||
a_offset3 += vl;
|
||||
a_offset4 += vl;
|
||||
a_offset5 += vl;
|
||||
a_offset6 += vl;
|
||||
a_offset7 += vl;
|
||||
a_offset8 += vl;
|
||||
b_offset += vl*8;
|
||||
}
|
||||
}
|
||||
|
||||
if (n & 4) {
|
||||
a_offset1 = a_offset;
|
||||
a_offset2 = a_offset1 + lda;
|
||||
a_offset3 = a_offset2 + lda;
|
||||
a_offset4 = a_offset3 + lda;
|
||||
a_offset += 4 * lda;
|
||||
|
||||
for(i = m; i > 0; i -= vl) {
|
||||
vl = VSETVL(i);
|
||||
|
||||
v1 = VLEV_FLOAT(a_offset1, vl);
|
||||
v2 = VLEV_FLOAT(a_offset2, vl);
|
||||
v3 = VLEV_FLOAT(a_offset3, vl);
|
||||
v4 = VLEV_FLOAT(a_offset4, vl);
|
||||
|
||||
vx4 = VSET_VX4(vx4, 0, v1);
|
||||
vx4 = VSET_VX4(vx4, 1, v2);
|
||||
vx4 = VSET_VX4(vx4, 2, v3);
|
||||
vx4 = VSET_VX4(vx4, 3, v4);
|
||||
|
||||
VSSEG4_FLOAT(b_offset, vx4, vl);
|
||||
|
||||
a_offset1 += vl;
|
||||
a_offset2 += vl;
|
||||
a_offset3 += vl;
|
||||
a_offset4 += vl;
|
||||
b_offset += vl*4;
|
||||
}
|
||||
}
|
||||
|
||||
if (n & 2) {
|
||||
a_offset1 = a_offset;
|
||||
a_offset2 = a_offset1 + lda;
|
||||
a_offset += 2 * lda;
|
||||
|
||||
for(i = m; i > 0; i -= vl) {
|
||||
vl = VSETVL(i);
|
||||
|
||||
v1 = VLEV_FLOAT(a_offset1, vl);
|
||||
v2 = VLEV_FLOAT(a_offset2, vl);
|
||||
|
||||
vx2 = VSET_VX2(vx2, 0, v1);
|
||||
vx2 = VSET_VX2(vx2, 1, v2);
|
||||
|
||||
VSSEG2_FLOAT(b_offset, vx2, vl);
|
||||
|
||||
a_offset1 += vl;
|
||||
a_offset2 += vl;
|
||||
b_offset += vl*2;
|
||||
}
|
||||
}
|
||||
|
||||
if (n & 1) {
|
||||
a_offset1 = a_offset;
|
||||
|
||||
for(i = m; i > 0; i -= vl) {
|
||||
vl = VSETVL(i);
|
||||
|
||||
v1 = VLEV_FLOAT(a_offset1, vl);
|
||||
|
||||
VSEV_FLOAT(b_offset, v1, vl);
|
||||
|
||||
a_offset1 += vl;
|
||||
b_offset += vl;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
76
kernel/riscv64/gemm_ncopy_rvv_v1.c
Normal file
76
kernel/riscv64/gemm_ncopy_rvv_v1.c
Normal file
@@ -0,0 +1,76 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) __riscv_vsetvl_e32m2(n)
|
||||
#define FLOAT_V_T vfloat32m2_t
|
||||
#define VLEV_FLOAT __riscv_vle32_v_f32m2
|
||||
#define VLSEV_FLOAT __riscv_vlse32_v_f32m2
|
||||
#define VSEV_FLOAT __riscv_vse32_v_f32m2
|
||||
#else
|
||||
#define VSETVL(n) __riscv_vsetvl_e64m2(n)
|
||||
#define FLOAT_V_T vfloat64m2_t
|
||||
#define VLEV_FLOAT __riscv_vle64_v_f64m2
|
||||
#define VLSEV_FLOAT __riscv_vlse64_v_f64m2
|
||||
#define VSEV_FLOAT __riscv_vse64_v_f64m2
|
||||
#endif
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b)
|
||||
{
|
||||
BLASLONG i, j;
|
||||
|
||||
FLOAT *a_offset;
|
||||
FLOAT *a_offset1;
|
||||
FLOAT *b_offset;
|
||||
|
||||
FLOAT_V_T v0;
|
||||
size_t vl;
|
||||
|
||||
//fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda);
|
||||
|
||||
a_offset = a;
|
||||
b_offset = b;
|
||||
|
||||
for(j = n; j > 0; j -= vl) {
|
||||
vl = VSETVL(j);
|
||||
|
||||
a_offset1 = a_offset;
|
||||
a_offset += vl * lda;
|
||||
|
||||
for(i = m; i > 0; i--) {
|
||||
v0 = VLSEV_FLOAT(a_offset1, lda * sizeof(FLOAT), vl);
|
||||
VSEV_FLOAT(b_offset, v0, vl);
|
||||
|
||||
a_offset1++;
|
||||
b_offset += vl;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
273
kernel/riscv64/gemm_tcopy_8_rvv.c
Normal file
273
kernel/riscv64/gemm_tcopy_8_rvv.c
Normal file
@@ -0,0 +1,273 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Select the LMUL=1 vector / tuple intrinsics matching the build's FLOAT
 * precision: single precision unless DOUBLE is defined.
 */
#if !defined(DOUBLE)

/* vector length and types */
#define VSETVL(n)               __riscv_vsetvl_e32m1(n)
#define FLOAT_V_T               vfloat32m1_t
#define FLOAT_VX2_T             vfloat32m1x2_t
#define FLOAT_VX4_T             vfloat32m1x4_t
#define FLOAT_VX8_T             vfloat32m1x8_t

/* plain loads and stores */
#define VLEV_FLOAT              __riscv_vle32_v_f32m1
#define VLSEV_FLOAT             __riscv_vlse32_v_f32m1
#define VSEV_FLOAT              __riscv_vse32_v_f32m1

/* strided segment loads */
#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m1x2
#define VLSSEG4_FLOAT           __riscv_vlsseg4e32_v_f32m1x4
#define VLSSEG8_FLOAT           __riscv_vlsseg8e32_v_f32m1x8

/* unit-stride segment stores */
#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m1x2
#define VSSEG4_FLOAT            __riscv_vsseg4e32_v_f32m1x4
#define VSSEG8_FLOAT            __riscv_vsseg8e32_v_f32m1x8

#else

/* vector length and types */
#define VSETVL(n)               __riscv_vsetvl_e64m1(n)
#define FLOAT_V_T               vfloat64m1_t
#define FLOAT_VX2_T             vfloat64m1x2_t
#define FLOAT_VX4_T             vfloat64m1x4_t
#define FLOAT_VX8_T             vfloat64m1x8_t

/* plain loads and stores */
#define VLEV_FLOAT              __riscv_vle64_v_f64m1
#define VLSEV_FLOAT             __riscv_vlse64_v_f64m1
#define VSEV_FLOAT              __riscv_vse64_v_f64m1

/* strided segment loads */
#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m1x2
#define VLSSEG4_FLOAT           __riscv_vlsseg4e64_v_f64m1x4
#define VLSSEG8_FLOAT           __riscv_vlsseg8e64_v_f64m1x8

/* unit-stride segment stores */
#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m1x2
#define VSSEG4_FLOAT            __riscv_vsseg4e64_v_f64m1x4
#define VSSEG8_FLOAT            __riscv_vsseg8e64_v_f64m1x8

#endif
|
||||
|
||||
/*
 * Transposed copy of an m x n panel of `a` into the packed buffer `b`.
 *
 * Source rows are `lda` elements apart.  Rows are consumed in groups of
 * 8 (as many as fit), then at most one group of 4, one of 2 and a single
 * trailing row.  Within a row group the columns are consumed 8 at a time,
 * followed by an optional 4-, 2- and 1-column remainder.
 *
 * `b` is written through four independent cursors:
 *   - full 8-column blocks start at b; consecutive column blocks of the
 *     same row group are m * 8 elements apart,
 *   - the 4-column remainder starts at b + m * (n & ~7),
 *   - the 2-column remainder starts at b + m * (n & ~3),
 *   - the 1-column remainder starts at b + m * (n & ~1).
 *
 * For row groups of 2, 4 and 8 rows, each block is moved with one strided
 * segment load (one segment per source row, one field per column) and one
 * unit-stride segment store, so it lands in `b` row after row.
 *
 * NOTE(review): the vector length (up to 8) is passed straight to the m1
 * intrinsics without going through VSETVL; this presumably relies on the
 * target providing at least 8 elements per m1 register for the selected
 * precision -- confirm against the configurations that build this file.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
{
    const size_t   row_bytes  = lda * sizeof(FLOAT);  /* byte distance between source rows */
    const BLASLONG col_blocks = n >> 3;               /* number of full 8-column blocks    */

    IFLOAT *src_rows = a;                 /* first row of the current row group */
    IFLOAT *dst_w8   = b;                 /* 8-column blocks                    */
    IFLOAT *dst_w4   = b + m * (n & ~7);  /* 4-column remainder                 */
    IFLOAT *dst_w2   = b + m * (n & ~3);  /* 2-column remainder                 */
    IFLOAT *dst_w1   = b + m * (n & ~1);  /* 1-column remainder                 */

    IFLOAT  *src, *dst;
    BLASLONG rows, groups, blk;
    size_t   vl;

    /* Row groups of 8, 4 and 2 rows share the same code; only vl differs. */
    for (rows = 8; rows >= 2; rows >>= 1) {
        groups = (rows == 8) ? (m >> 3) : ((m & rows) ? 1 : 0);
        vl     = (size_t)rows;

        for (; groups > 0; groups--) {
            src = src_rows;
            dst = dst_w8;
            src_rows += rows * lda;
            dst_w8   += rows * 8;

            for (blk = col_blocks; blk > 0; blk--) {
                VSSEG8_FLOAT(dst, VLSSEG8_FLOAT(src, row_bytes, vl), vl);
                src += 8;
                dst += m * 8;
            }

            if (n & 4) {
                VSSEG4_FLOAT(dst_w4, VLSSEG4_FLOAT(src, row_bytes, vl), vl);
                src    += 4;
                dst_w4 += rows * 4;
            }

            if (n & 2) {
                VSSEG2_FLOAT(dst_w2, VLSSEG2_FLOAT(src, row_bytes, vl), vl);
                src    += 2;
                dst_w2 += rows * 2;
            }

            if (n & 1) {
                VSEV_FLOAT(dst_w1, VLSEV_FLOAT(src, row_bytes, vl), vl);
                src    += 1;
                dst_w1 += rows;
            }
        }
    }

    /* Single trailing row: plain unit-stride copies, one per column group. */
    if (m & 1) {
        src = src_rows;
        dst = dst_w8;

        for (blk = col_blocks; blk > 0; blk--) {
            VSEV_FLOAT(dst, VLEV_FLOAT(src, 8), 8);
            src += 8;
            dst += 8 * m;
        }

        if (n & 4) {
            VSEV_FLOAT(dst_w4, VLEV_FLOAT(src, 4), 4);
            src += 4;
        }

        if (n & 2) {
            VSEV_FLOAT(dst_w2, VLEV_FLOAT(src, 2), 2);
            src += 2;
        }

        if (n & 1) {
            *dst_w1 = *src;
        }
    }

    return 0;
}
|
||||
74
kernel/riscv64/gemm_tcopy_rvv_v1.c
Normal file
74
kernel/riscv64/gemm_tcopy_rvv_v1.c
Normal file
@@ -0,0 +1,74 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Precision-dependent aliases for the RVV intrinsics used by this file:
 * f32 / m2 intrinsics without DOUBLE, f64 / m2 intrinsics with DOUBLE.
 */
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) __riscv_vsetvl_e32m2(n)
|
||||
#define FLOAT_V_T vfloat32m2_t
|
||||
#define VLEV_FLOAT __riscv_vle32_v_f32m2
|
||||
#define VSEV_FLOAT __riscv_vse32_v_f32m2
|
||||
#else /* DOUBLE */
|
||||
#define VSETVL(n) __riscv_vsetvl_e64m2(n)
|
||||
#define FLOAT_V_T vfloat64m2_t
|
||||
#define VLEV_FLOAT __riscv_vle64_v_f64m2
|
||||
#define VSEV_FLOAT __riscv_vse64_v_f64m2
|
||||
#endif /* !defined(DOUBLE) */
|
||||
|
||||
/*
 * Copies an m x n panel of `a` (rows `lda` elements apart) into `b` in
 * strips of `vl` columns, where `vl` is whatever VSETVL grants for the
 * columns still to be copied.
 *
 * For every strip, the vl-wide slice of each of the m rows is loaded with a
 * unit-stride vector load and appended to `b`, so `b` receives m * vl
 * contiguous elements per strip.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
{
    IFLOAT  *strip     = a;   /* first column of the current strip, row 0 */
    IFLOAT  *dst       = b;   /* append position in the packed buffer     */
    BLASLONG cols_left = n;

    while (cols_left > 0) {
        const size_t vl  = VSETVL(cols_left);
        IFLOAT      *row = strip;
        BLASLONG     r;

        for (r = 0; r < m; r++) {
            VSEV_FLOAT(dst, VLEV_FLOAT(row, vl), vl);
            row += lda;
            dst += vl;
        }

        strip     += vl;
        cols_left -= vl;
    }

    return 0;
}
|
||||
601
kernel/riscv64/gemmkernel_rvv_v1x8.c
Normal file
601
kernel/riscv64/gemmkernel_rvv_v1x8.c
Normal file
@@ -0,0 +1,601 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Precision-dependent aliases for the RVV intrinsics used by this kernel:
 * f32 / m2 intrinsics without DOUBLE, f64 / m2 intrinsics with DOUBLE.
 * VFMVVF_FLOAT broadcasts a scalar, VFMACCVF_FLOAT is the vector-scalar
 * multiply-accumulate.
 */
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) __riscv_vsetvl_e32m2(n)
|
||||
#define FLOAT_V_T vfloat32m2_t
|
||||
#define VLEV_FLOAT __riscv_vle32_v_f32m2
|
||||
#define VSEV_FLOAT __riscv_vse32_v_f32m2
|
||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2
|
||||
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2
|
||||
#else /* DOUBLE */
|
||||
#define VSETVL(n) __riscv_vsetvl_e64m2(n)
|
||||
#define FLOAT_V_T vfloat64m2_t
|
||||
#define VLEV_FLOAT __riscv_vle64_v_f64m2
|
||||
#define VSEV_FLOAT __riscv_vse64_v_f64m2
|
||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2
|
||||
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2
|
||||
#endif /* !defined(DOUBLE) */
|
||||
|
||||
/*
 * Local helpers for the kernel below.  They expand inside CNAME and use its
 * locals (ptrba, ptrbb, vl, k, bk, alpha, va0..va7, vres0..vres7) directly;
 * all of them are #undef'd right after the function.
 */

/* Load the next vl-element slice of packed A into `v` and step past it. */
#define GEMM_V1X8_LOAD_A(v) \
    do { (v) = VLEV_FLOAT(ptrba, vl); ptrba += vl; } while (0)

/* acc[c] += B[c] * v for the first 1 / 2 / 4 / 8 accumulators. */
#define GEMM_V1X8_FMA1(v) \
    vres0 = VFMACCVF_FLOAT(vres0, ptrbb[0], (v), vl);
#define GEMM_V1X8_FMA2(v) \
    GEMM_V1X8_FMA1(v) \
    vres1 = VFMACCVF_FLOAT(vres1, ptrbb[1], (v), vl);
#define GEMM_V1X8_FMA4(v) \
    GEMM_V1X8_FMA2(v) \
    vres2 = VFMACCVF_FLOAT(vres2, ptrbb[2], (v), vl); \
    vres3 = VFMACCVF_FLOAT(vres3, ptrbb[3], (v), vl);
#define GEMM_V1X8_FMA8(v) \
    GEMM_V1X8_FMA4(v) \
    vres4 = VFMACCVF_FLOAT(vres4, ptrbb[4], (v), vl); \
    vres5 = VFMACCVF_FLOAT(vres5, ptrbb[5], (v), vl); \
    vres6 = VFMACCVF_FLOAT(vres6, ptrbb[6], (v), vl); \
    vres7 = VFMACCVF_FLOAT(vres7, ptrbb[7], (v), vl);

/* One k step for NB columns: NB multiply-accumulates, then advance B by NB. */
#define GEMM_V1X8_STEP(NB, v) \
    do { GEMM_V1X8_FMA##NB(v) ptrbb += (NB); } while (0)

/*
 * Full k loop for NB columns.  The main loop handles 8 k steps per
 * iteration and issues each A load one step ahead of the arithmetic that
 * consumes it; the second loop handles the remaining bk & 7 steps.
 */
#define GEMM_V1X8_KLOOP(NB) do {                    \
    for (k = bk/8; k > 0; k--) {                    \
        GEMM_V1X8_LOAD_A(va0);                      \
        GEMM_V1X8_LOAD_A(va1);                      \
        GEMM_V1X8_STEP(NB, va0);                    \
        GEMM_V1X8_LOAD_A(va2);                      \
        GEMM_V1X8_STEP(NB, va1);                    \
        GEMM_V1X8_LOAD_A(va3);                      \
        GEMM_V1X8_STEP(NB, va2);                    \
        GEMM_V1X8_LOAD_A(va4);                      \
        GEMM_V1X8_STEP(NB, va3);                    \
        GEMM_V1X8_LOAD_A(va5);                      \
        GEMM_V1X8_STEP(NB, va4);                    \
        GEMM_V1X8_LOAD_A(va6);                      \
        GEMM_V1X8_STEP(NB, va5);                    \
        GEMM_V1X8_LOAD_A(va7);                      \
        GEMM_V1X8_STEP(NB, va6);                    \
        GEMM_V1X8_STEP(NB, va7);                    \
    }                                               \
    for (k = bk&7; k > 0; k--) {                    \
        GEMM_V1X8_LOAD_A(va0);                      \
        GEMM_V1X8_STEP(NB, va0);                    \
    }                                               \
} while (0)

/* Cp[0..vl) += alpha * acc, then advance Cp by vl (va0 is used as scratch). */
#define GEMM_V1X8_UPDATE_C(Cp, acc) do {            \
    va0 = VLEV_FLOAT((Cp), vl);                     \
    va0 = VFMACCVF_FLOAT(va0, alpha, (acc), vl);    \
    VSEV_FLOAT((Cp), va0, vl);                      \
    (Cp) += vl;                                     \
} while (0)

/*
 * GEMM micro-kernel: accumulates alpha * A * B into C.
 *
 *   bm, bn, bk  rows of C, columns of C, and depth of the product
 *   alpha       scalar applied to the product before it is added to C
 *   ba          packed A; read as consecutive vl-element slices, bk slices
 *               per row tile, row tiles back to back
 *   bb          packed B; read as bk groups of 8 values for every group of
 *               8 columns, then bk groups of 4, 2 and 1 values for the
 *               column remainders
 *   C, ldc      output matrix and the element distance between its columns
 *   offset      only present when TRMMKERNEL is defined; not used here
 *
 * Columns are processed in groups of 8, then an optional group of 4, 2
 * and 1.  Within a column group the rows are processed vl at a time, with
 * vl chosen by VSETVL for the rows that remain.  Always returns 0.
 */
int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, IFLOAT* ba, IFLOAT* bb, FLOAT* C, BLASLONG ldc
#ifdef TRMMKERNEL
        ,BLASLONG offset
#endif
        )
{
    BLASLONG i, j, k;
    FLOAT *C0, *C1, *C2, *C3, *C4, *C5, *C6, *C7;
    IFLOAT *ptrba, *ptrbb;

    FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7;
    FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7;
    size_t vl;

    /* Groups of 8 columns. */
    for (j = bn/8; j > 0; j--) {
        C0 = C;
        C1 = C0 + ldc;
        C2 = C1 + ldc;
        C3 = C2 + ldc;
        C4 = C3 + ldc;
        C5 = C4 + ldc;
        C6 = C5 + ldc;
        C7 = C6 + ldc;
        ptrba = ba;

        for (i = bm; i > 0; i -= vl) {
            vl = VSETVL(i);
            ptrbb = bb;   /* every row tile re-reads the same B group */

            vres0 = VFMVVF_FLOAT(0.0, vl);
            vres1 = VFMVVF_FLOAT(0.0, vl);
            vres2 = VFMVVF_FLOAT(0.0, vl);
            vres3 = VFMVVF_FLOAT(0.0, vl);
            vres4 = VFMVVF_FLOAT(0.0, vl);
            vres5 = VFMVVF_FLOAT(0.0, vl);
            vres6 = VFMVVF_FLOAT(0.0, vl);
            vres7 = VFMVVF_FLOAT(0.0, vl);

            GEMM_V1X8_KLOOP(8);

            GEMM_V1X8_UPDATE_C(C0, vres0);
            GEMM_V1X8_UPDATE_C(C1, vres1);
            GEMM_V1X8_UPDATE_C(C2, vres2);
            GEMM_V1X8_UPDATE_C(C3, vres3);
            GEMM_V1X8_UPDATE_C(C4, vres4);
            GEMM_V1X8_UPDATE_C(C5, vres5);
            GEMM_V1X8_UPDATE_C(C6, vres6);
            GEMM_V1X8_UPDATE_C(C7, vres7);
        }

        bb += (bk<<3);
        C  += (ldc<<3);
    }

    /* Remainder group of 4 columns. */
    if (bn & 4) {
        C0 = C;
        C1 = C0 + ldc;
        C2 = C1 + ldc;
        C3 = C2 + ldc;
        ptrba = ba;

        for (i = bm; i > 0; i -= vl) {
            vl = VSETVL(i);
            ptrbb = bb;

            vres0 = VFMVVF_FLOAT(0.0, vl);
            vres1 = VFMVVF_FLOAT(0.0, vl);
            vres2 = VFMVVF_FLOAT(0.0, vl);
            vres3 = VFMVVF_FLOAT(0.0, vl);

            GEMM_V1X8_KLOOP(4);

            GEMM_V1X8_UPDATE_C(C0, vres0);
            GEMM_V1X8_UPDATE_C(C1, vres1);
            GEMM_V1X8_UPDATE_C(C2, vres2);
            GEMM_V1X8_UPDATE_C(C3, vres3);
        }

        bb += (bk<<2);
        C  += (ldc<<2);
    }

    /* Remainder group of 2 columns. */
    if (bn & 2) {
        C0 = C;
        C1 = C0 + ldc;
        ptrba = ba;

        for (i = bm; i > 0; i -= vl) {
            vl = VSETVL(i);
            ptrbb = bb;

            vres0 = VFMVVF_FLOAT(0.0, vl);
            vres1 = VFMVVF_FLOAT(0.0, vl);

            GEMM_V1X8_KLOOP(2);

            GEMM_V1X8_UPDATE_C(C0, vres0);
            GEMM_V1X8_UPDATE_C(C1, vres1);
        }

        bb += (bk<<1);
        C  += (ldc<<1);
    }

    /* Last single column. */
    if (bn & 1) {
        C0 = C;
        ptrba = ba;

        for (i = bm; i > 0; i -= vl) {
            vl = VSETVL(i);
            ptrbb = bb;

            vres0 = VFMVVF_FLOAT(0.0, vl);

            GEMM_V1X8_KLOOP(1);

            GEMM_V1X8_UPDATE_C(C0, vres0);
        }

        bb += (bk);
        C  += (ldc);
    }

    return 0;
}

#undef GEMM_V1X8_UPDATE_C
#undef GEMM_V1X8_KLOOP
#undef GEMM_V1X8_STEP
#undef GEMM_V1X8_FMA8
#undef GEMM_V1X8_FMA4
#undef GEMM_V1X8_FMA2
#undef GEMM_V1X8_FMA1
#undef GEMM_V1X8_LOAD_A
|
||||
94
kernel/riscv64/gemv_n_rvv.c
Normal file
94
kernel/riscv64/gemv_n_rvv.c
Normal file
@@ -0,0 +1,94 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Precision-dependent aliases for the RVV intrinsics used by this file:
 * f32 / m8 intrinsics without DOUBLE, f64 / m8 intrinsics with DOUBLE.
 * VLEV/VSEV are the unit-stride load/store, VLSEV/VSSEV the strided ones.
 */
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) __riscv_vsetvl_e32m8(n)
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define VLEV_FLOAT __riscv_vle32_v_f32m8
|
||||
#define VLSEV_FLOAT __riscv_vlse32_v_f32m8
|
||||
#define VSEV_FLOAT __riscv_vse32_v_f32m8
|
||||
#define VSSEV_FLOAT __riscv_vsse32_v_f32m8
|
||||
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8
|
||||
#else /* DOUBLE */
|
||||
#define VSETVL(n) __riscv_vsetvl_e64m8(n)
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define VLEV_FLOAT __riscv_vle64_v_f64m8
|
||||
#define VLSEV_FLOAT __riscv_vlse64_v_f64m8
|
||||
#define VSEV_FLOAT __riscv_vse64_v_f64m8
|
||||
#define VSSEV_FLOAT __riscv_vsse64_v_f64m8
|
||||
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8
|
||||
#endif /* !defined(DOUBLE) */
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
|
||||
{
|
||||
if(n < 0) return(0);
|
||||
|
||||
FLOAT *a_ptr, *x_ptr;
|
||||
BLASLONG i;
|
||||
FLOAT_V_T va, vy;
|
||||
|
||||
if(inc_y == 1) {
|
||||
|
||||
for (size_t vl; m > 0; m -= vl, y += vl, a += vl) {
|
||||
vl = VSETVL(m);
|
||||
a_ptr = a;
|
||||
x_ptr = x;
|
||||
vy = VLEV_FLOAT(y, vl);
|
||||
for(i = 0; i < n; i++) {
|
||||
va = VLEV_FLOAT(a_ptr, vl);
|
||||
vy = VFMACCVF_FLOAT(vy, (alpha * (*x_ptr)), va, vl);
|
||||
|
||||
a_ptr += lda;
|
||||
x_ptr += inc_x;
|
||||
}
|
||||
VSEV_FLOAT(y, vy, vl);
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
BLASLONG stride_y = inc_y * sizeof(FLOAT);
|
||||
|
||||
for (size_t vl; m > 0; m -= vl, y += vl*inc_y, a += vl) {
|
||||
vl = VSETVL(m);
|
||||
a_ptr = a;
|
||||
x_ptr = x;
|
||||
vy = VLSEV_FLOAT(y, stride_y, vl);
|
||||
for(i = 0; i < n; i++) {
|
||||
va = VLEV_FLOAT(a_ptr, vl);
|
||||
vy = VFMACCVF_FLOAT(vy, (alpha * (*x_ptr)), va, vl);
|
||||
|
||||
a_ptr += lda;
|
||||
x_ptr += inc_x;
|
||||
}
|
||||
VSSEV_FLOAT(y, stride_y, vy, vl);
|
||||
}
|
||||
|
||||
}
|
||||
return(0);
|
||||
}
|
||||
@@ -27,21 +27,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#include "common.h"
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m4(n)
|
||||
#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n)
|
||||
#define FLOAT_V_T vfloat32m4_t
|
||||
#define VLEV_FLOAT vle32_v_f32m4
|
||||
#define VLSEV_FLOAT vlse32_v_f32m4
|
||||
#define VSEV_FLOAT vse32_v_f32m4
|
||||
#define VSSEV_FLOAT vsse32_v_f32m4
|
||||
#define VFMACCVF_FLOAT vfmacc_vf_f32m4
|
||||
#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4)
|
||||
#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4)
|
||||
#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4)
|
||||
#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4)
|
||||
#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4)
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m4(n)
|
||||
#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
|
||||
#define FLOAT_V_T vfloat64m4_t
|
||||
#define VLEV_FLOAT vle64_v_f64m4
|
||||
#define VLSEV_FLOAT vlse64_v_f64m4
|
||||
#define VSEV_FLOAT vse64_v_f64m4
|
||||
#define VSSEV_FLOAT vsse64_v_f64m4
|
||||
#define VFMACCVF_FLOAT vfmacc_vf_f64m4
|
||||
#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4)
|
||||
#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4)
|
||||
#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4)
|
||||
#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4)
|
||||
#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4)
|
||||
#endif
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
|
||||
|
||||
118
kernel/riscv64/gemv_t_rvv.c
Normal file
118
kernel/riscv64/gemv_t_rvv.c
Normal file
@@ -0,0 +1,118 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) __riscv_vsetvl_e32m8(n)
|
||||
#define VSETVL_MAX __riscv_vsetvlmax_e32m8()
|
||||
#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1()
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VLEV_FLOAT __riscv_vle32_v_f32m8
|
||||
#define VLSEV_FLOAT __riscv_vlse32_v_f32m8
|
||||
#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m8_f32m1
|
||||
#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m8_tu
|
||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8
|
||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
|
||||
#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32
|
||||
#else
|
||||
#define VSETVL(n) __riscv_vsetvl_e64m8(n)
|
||||
#define VSETVL_MAX __riscv_vsetvlmax_e64m8()
|
||||
#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1()
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VLEV_FLOAT __riscv_vle64_v_f64m8
|
||||
#define VLSEV_FLOAT __riscv_vlse64_v_f64m8
|
||||
#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m8_f64m1
|
||||
#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m8_tu
|
||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8
|
||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
|
||||
#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64
|
||||
#endif
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
|
||||
{
|
||||
BLASLONG i, j;
|
||||
FLOAT *a_ptr, *x_ptr;
|
||||
|
||||
FLOAT_V_T va, vx, vr;
|
||||
FLOAT_V_T_M1 v_res, v_z0;
|
||||
size_t vlmax = VSETVL_MAX_M1;
|
||||
v_z0 = VFMVVF_FLOAT_M1(0, vlmax);
|
||||
vlmax = VSETVL_MAX;
|
||||
|
||||
if(inc_x == 1) {
|
||||
|
||||
for(i = 0; i < n; i++) {
|
||||
j = m;
|
||||
a_ptr = a;
|
||||
x_ptr = x;
|
||||
vr = VFMVVF_FLOAT(0, vlmax);
|
||||
|
||||
for (size_t vl; j > 0; j -= vl, a_ptr += vl, x_ptr += vl) {
|
||||
vl = VSETVL(j);
|
||||
|
||||
va = VLEV_FLOAT(a_ptr, vl);
|
||||
vx = VLEV_FLOAT(x_ptr, vl);
|
||||
vr = VFMACCVV_FLOAT_TU(vr, va, vx, vl);
|
||||
}
|
||||
|
||||
v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax);
|
||||
*y += alpha * VFMVFS_FLOAT_M1(v_res);
|
||||
y += inc_y;
|
||||
a += lda;
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
BLASLONG stride_x = inc_x * sizeof(FLOAT);
|
||||
|
||||
for(i = 0; i < n; i++) {
|
||||
j = m;
|
||||
a_ptr = a;
|
||||
x_ptr = x;
|
||||
vr = VFMVVF_FLOAT(0, vlmax);
|
||||
|
||||
for (size_t vl; j > 0; j -= vl, a_ptr += vl, x_ptr += vl*inc_x) {
|
||||
vl = VSETVL(j);
|
||||
|
||||
va = VLEV_FLOAT(a_ptr, vl);
|
||||
vx = VLSEV_FLOAT(x_ptr, stride_x, vl);
|
||||
vr = VFMACCVV_FLOAT_TU(vr, va, vx, vl);
|
||||
}
|
||||
|
||||
v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax);
|
||||
*y += alpha * VFMVFS_FLOAT_M1(v_res);
|
||||
y += inc_y;
|
||||
a += lda;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return(0);
|
||||
}
|
||||
@@ -27,107 +27,110 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#include "common.h"
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m4(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m1()
|
||||
#define FLOAT_V_T vfloat32m4_t
|
||||
#define VSETVL(n) RISCV_RVV(vsetvl_e32m2)(n)
|
||||
#define FLOAT_V_T vfloat32m2_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
|
||||
#define VLEV_FLOAT vle32_v_f32m4
|
||||
#define VLSEV_FLOAT vlse32_v_f32m4
|
||||
#define VFREDSUM_FLOAT vfredosum_vs_f32m4_f32m1
|
||||
#define VFMACCVV_FLOAT vfmacc_vv_f32m4
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f32m4
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
|
||||
#define VFDOTVV_FLOAT vfdot_vv_f32m4
|
||||
#define VFMULVV_FLOAT vfmul_vv_f32m4
|
||||
#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m2)
|
||||
#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m2)
|
||||
#ifdef RISCV_0p10_INTRINSICS
|
||||
#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f32m2_f32m1(v_res, va, vb, gvl)
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m4(n)
|
||||
#define VSETVL_MAX vsetvlmax_e64m1()
|
||||
#define FLOAT_V_T vfloat64m4_t
|
||||
#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m2_f32m1)
|
||||
#endif
|
||||
#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m2)
|
||||
#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m2)
|
||||
#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1)
|
||||
#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m2)
|
||||
#define xint_t int
|
||||
#else
|
||||
#define VSETVL(n) RISCV_RVV(vsetvl_e64m2)(n)
|
||||
#define FLOAT_V_T vfloat64m2_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
|
||||
#define VLEV_FLOAT vle64_v_f64m4
|
||||
#define VLSEV_FLOAT vlse64_v_f64m4
|
||||
#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
|
||||
#define VFMACCVV_FLOAT vfmacc_vv_f64m4
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f64m4
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
|
||||
#define VFDOTVV_FLOAT vfdot_vv_f64m4
|
||||
#define VFMULVV_FLOAT vfmul_vv_f64m4
|
||||
#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m2)
|
||||
#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m2)
|
||||
#ifdef RISCV_0p10_INTRINSICS
|
||||
#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f64m2_f64m1(v_res, va, vb, gvl)
|
||||
#else
|
||||
#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m2_f64m1)
|
||||
#endif
|
||||
#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m2)
|
||||
#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m2)
|
||||
#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1)
|
||||
#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m2)
|
||||
#define xint_t long long
|
||||
#endif
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
|
||||
{
|
||||
BLASLONG i = 0, j = 0, k = 0;
|
||||
BLASLONG ix = 0, iy = 0;
|
||||
FLOAT *a_ptr = a;
|
||||
BLASLONG i = 0, j = 0, k = 0;
|
||||
BLASLONG ix = 0, iy = 0;
|
||||
FLOAT *a_ptr = a;
|
||||
FLOAT temp;
|
||||
|
||||
FLOAT_V_T va, vr, vx;
|
||||
unsigned int gvl = 0;
|
||||
FLOAT_V_T_M1 v_res, v_z0;
|
||||
gvl = VSETVL_MAX;
|
||||
v_res = VFMVVF_FLOAT_M1(0, gvl);
|
||||
v_z0 = VFMVVF_FLOAT_M1(0, gvl);
|
||||
FLOAT_V_T_M1 v_res;
|
||||
|
||||
|
||||
if(inc_x == 1){
|
||||
for(i = 0; i < n; i++){
|
||||
v_res = VFMVVF_FLOAT_M1(0, 1);
|
||||
gvl = VSETVL(m);
|
||||
j = 0;
|
||||
vr = VFMVVF_FLOAT(0, gvl);
|
||||
for(k = 0; k < m/gvl; k++){
|
||||
va = VLEV_FLOAT(&a_ptr[j], gvl);
|
||||
vx = VLEV_FLOAT(&x[j], gvl);
|
||||
vr = VFMACCVV_FLOAT(vr, va, vx, gvl);
|
||||
vr = VFMULVV_FLOAT(va, vx, gvl); // could vfmacc here and reduce outside loop
|
||||
v_res = VFREDSUM_FLOAT(vr, v_res, gvl); // but that reordering diverges far enough from scalar path to make tests fail
|
||||
j += gvl;
|
||||
}
|
||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
|
||||
temp = (FLOAT)VFMVFS_FLOAT(v_res);
|
||||
if(j < m){
|
||||
gvl = VSETVL(m-j);
|
||||
va = VLEV_FLOAT(&a_ptr[j], gvl);
|
||||
vx = VLEV_FLOAT(&x[j], gvl);
|
||||
vr = VFMULVV_FLOAT(va, vx, gvl);
|
||||
|
||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
|
||||
temp += (FLOAT)VFMVFS_FLOAT(v_res);
|
||||
v_res = VFREDSUM_FLOAT(vr, v_res, gvl);
|
||||
}
|
||||
temp = (FLOAT)EXTRACT_FLOAT(v_res);
|
||||
y[iy] += alpha * temp;
|
||||
|
||||
|
||||
iy += inc_y;
|
||||
a_ptr += lda;
|
||||
}
|
||||
}else{
|
||||
BLASLONG stride_x = inc_x * sizeof(FLOAT);
|
||||
|
||||
for(i = 0; i < n; i++){
|
||||
v_res = VFMVVF_FLOAT_M1(0, 1);
|
||||
gvl = VSETVL(m);
|
||||
BLASLONG inc_xv = inc_x * gvl;
|
||||
j = 0;
|
||||
ix = 0;
|
||||
vr = VFMVVF_FLOAT(0, gvl);
|
||||
for(k = 0; k < m/gvl; k++){
|
||||
va = VLEV_FLOAT(&a_ptr[j], gvl);
|
||||
vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
|
||||
vr = VFMACCVV_FLOAT(vr, va, vx, gvl);
|
||||
vr = VFMULVV_FLOAT(va, vx, gvl);
|
||||
v_res = VFREDSUM_FLOAT(vr, v_res, gvl);
|
||||
j += gvl;
|
||||
ix += inc_xv;
|
||||
ix += inc_x * gvl;
|
||||
}
|
||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
|
||||
temp = (FLOAT)VFMVFS_FLOAT(v_res);
|
||||
if(j < m){
|
||||
gvl = VSETVL(m-j);
|
||||
va = VLEV_FLOAT(&a_ptr[j], gvl);
|
||||
vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
|
||||
vr = VFMULVV_FLOAT(va, vx, gvl);
|
||||
|
||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
|
||||
temp += (FLOAT)VFMVFS_FLOAT(v_res);
|
||||
v_res = VFREDSUM_FLOAT(vr, v_res, gvl);
|
||||
}
|
||||
temp = (FLOAT)EXTRACT_FLOAT(v_res);
|
||||
y[iy] += alpha * temp;
|
||||
|
||||
|
||||
iy += inc_y;
|
||||
a_ptr += lda;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return(0);
|
||||
}
|
||||
|
||||
|
||||
673
kernel/riscv64/generate_kernel.py
Executable file
673
kernel/riscv64/generate_kernel.py
Executable file
@@ -0,0 +1,673 @@
|
||||
#!/usr/bin/python3
|
||||
|
||||
import sys, os
|
||||
import contextlib
|
||||
|
||||
#-----------------------------------------------------------------------
|
||||
def ERROR(*args, **kwargs):
    """Print a diagnostic to stderr and terminate the script.

    All positional and keyword arguments are forwarded to print(); the
    interpreter then exits with status -1.
    """
    print(*args, file=sys.stderr, **kwargs)
    raise SystemExit(-1)
|
||||
|
||||
class Target(object):
    """Indentation-aware line emitter with scoped format-string mappings.

    Every line passed to write() is expanded with str.format() using the
    currently active mappings, prefixed with the current indentation, and
    handed to the `out` callable.
    """

    def __init__( self, out, mappings, initial_level=0, tab_width=4 ):
        """
        out           -- callable invoked once per emitted line (one str argument)
        mappings      -- dict of names available to format strings
        initial_level -- starting indentation depth
        tab_width     -- number of spaces per indentation level
        """
        self._level = initial_level
        self._tab_width = tab_width
        self._out = out
        self._mappings = mappings

    @contextlib.contextmanager
    def map( self, **items ):
        """Temporarily overlay `items` on the active mappings.

        Yields the merged dict.  The previous mappings are restored when the
        `with` body finishes, including when it raises.
        """
        old_mappings = self._mappings
        self._mappings = dict(old_mappings, **items)
        try:
            yield self._mappings
        finally:
            # restore even on error so a caught exception cannot leave
            # stale overrides behind
            self._mappings = old_mappings

    @contextlib.contextmanager
    def block( self, start=None, end=None, **args ):
        """Emit an indented block, optionally bracketed by `start`/`end` lines.

        `start` (if given) is preceded by a blank line; `end` (if given) is
        followed by one.  Extra keyword arguments are mapped for the duration
        of the block, including the `start` and `end` lines themselves.
        Yields the indentation level used inside the block.  If the body
        raises, the indentation level is restored and `end` is not emitted.
        """
        with self.map(**args):
            if start is not None:
                self.write()
                self.write(start)
            self._level += 1
            try:
                yield self._level
            finally:
                self._level -= 1
            if end is not None:
                self.write(end)
                self.write()

    def write( self, fmt=None, *args, **kwargs ):
        """Emit one formatted, indented line; with no `fmt`, emit an empty line.

        `kwargs` override the active mappings for this line only.
        """
        if fmt is not None:
            mappings = dict(self._mappings, **kwargs) if kwargs else self._mappings
            self._out(self._indent_str() + fmt.format(*args, **mappings))
        else:
            self._out("")

    def _indent_str( self ):
        """Return the whitespace prefix for the current indentation level."""
        return ' ' * (self._level * self._tab_width)
|
||||
|
||||
#-----------------------------------------------------------------------
|
||||
def generate_trmm_block( dest ):
    """Emit the TRMM prologue that derives `off` and `pass_K`.

    The emitted C is selected by the LEFT and BACKWARDS preprocessor symbols:
    `off` is computed from `offset` plus m_top or n_top, and either the
    `ai`/`bi` indices are advanced while `pass_K` is shortened (BACKWARDS),
    or `pass_K` is set to `off` plus the tile's M or N.  The {M}, {N},
    {index_type} and {elt_size} placeholders are resolved by `dest`.
    """
    def indented( *lines ):
        # body of one preprocessor branch, one level deeper
        with dest.block():
            for line in lines:
                dest.write(line)

    dest.write("{index_type} pass_K = K;")

    dest.write("#ifdef LEFT")
    indented("{index_type} off = offset + m_top;")
    dest.write("#else")
    indented("{index_type} off = -offset + n_top;")
    dest.write("#endif")

    dest.write("#ifdef BACKWARDS")
    indented(
        "ai += off*{M}{elt_size};",
        "bi += off*{N}{elt_size};",
        "pass_K -= off;",
    )
    dest.write("#else")
    with dest.block():
        dest.write("#ifdef LEFT")
        indented("pass_K = off + {M};")
        dest.write("#else")
        indented("pass_K = off + {N};")
        dest.write("#endif")
    dest.write("#endif")
|
||||
|
||||
#-----------------------------------------------------------------------
|
||||
def generate_gemm_kernel_inner_real( settings, dest, M, N, vlen, a_regs ):
    """Emit the vectorised body of one real M x N GEMM/TRMM tile.

    settings -- mapping of option name to an object with a `.value` attribute
    dest     -- emitter providing map()/block()/write()
    M, N     -- tile dimensions substituted into the emitted index arithmetic
    vlen     -- accepted for signature parity; not read by this function
    a_regs   -- number of vector registers loaded from A per k step

    The emitted code peels the k=0 step (multiply), loops k from 1
    (multiply-accumulate), optionally narrows wide accumulators, then scales
    by alpha and stores to C.  For GEMM the existing C values are loaded and
    accumulated into; for TRMM the result is alpha times the product.
    """
    is_trmm = (settings['op'].value == 'trmm')
    narrow_result = (settings['param_precision'].value != 'double') and settings['force_acc_double'].value

    # (B column, A register) pairs in emission order; the flat result index
    # is col*a_regs+reg
    cells = [(col, reg) for col in range(N) for reg in range(a_regs)]
    last_idx = N*a_regs - 1

    def ci_step( reg, idx ):
        # how to advance ci after touching result `idx`: nothing after the
        # very last one, jump to the next column after a column's last
        # register, otherwise step by one vector
        if idx == last_idx:
            return ''
        if reg == a_regs-1:
            return ' ci += ldc-gvl*{};'.format(a_regs-1)
        return ' ci += gvl;'

    with dest.map( M=M, N=N ):
        dest.write("{index_type} ai=m_top*K{elt_size};")
        dest.write("{index_type} bi=n_top*K{elt_size};")
        if is_trmm:
            generate_trmm_block( dest )

        # ---- peeled first k step: load operands and initialise accumulators
        for col in range(N):
            dest.write("{param_scalar_t} B{i} = B[bi+{i}];", i=col)
        dest.write("bi += {N};")
        dest.write()

        for reg in range(a_regs):
            dest.write("{param_vector_t} A{i} = {VLEV}( &A[ai+{i}*gvl], gvl );", i=reg)
        dest.write("ai += {M};")
        dest.write()

        for col, reg in cells:
            dest.write("{acc_vector_t} result{dest} = {VMUL_TO_ACC}( A{i}, B{j}, gvl);", dest=col*a_regs+reg, i=reg, j=col)

        # ---- remaining k steps
        with dest.block("for({index_type} k=1; k<{Kend}; k++) {{", "}}", Kend=('pass_K' if is_trmm else 'K')):
            for col in range(N):
                dest.write("B{i} = B[bi+{i}];", i=col )
            dest.write("bi += {N};")
            dest.write()

            for reg in range(a_regs):
                dest.write("A{i} = {VLEV}( &A[ai+{i}*gvl], gvl );", i=reg)

            dest.write("ai += {M};")
            dest.write()

            for col, reg in cells:
                dest.write("result{dest} = {VMACC_TO_ACC}( result{dest}, B{j}, A{i}, gvl);", dest= col*a_regs+reg, j=col, i=reg )

        dest.write()
        dest.write("{index_type} ci=n_top*ldc+m_top;")
        dest.write()

        # ---- optional narrowing of the wide accumulators
        if narrow_result:
            for col, reg in cells:
                dest.write("{param_vector_t} narrowed{idx} = {VFNCVT}( result{idx}, gvl );", idx=col*a_regs+reg)

        # ---- GEMM accumulates into the existing C contents
        if not is_trmm:
            for col, reg in cells:
                idx = col*a_regs+reg
                dest.write("{param_vector_t} c{idx} = {VLEV}( &C[ci], gvl);{increment}", idx=idx, increment=ci_step(reg, idx))

        # ---- scale by alpha
        for col, reg in cells:
            idx = col*a_regs+reg
            if narrow_result:
                if is_trmm:
                    dest.write("{param_vector_t} c{idx} = {VFMUL}( narrowed{idx}, alpha, gvl );", idx=idx)
                else:
                    dest.write("c{idx} = {VFMACC}( c{idx}, alpha, narrowed{idx}, gvl );", idx=idx)
            else:
                if is_trmm:
                    dest.write("{param_vector_t} c{idx} = {VFMUL}( result{idx}, alpha, gvl );", idx=idx)
                else:
                    dest.write("c{idx} = {VFMACC}( c{idx}, alpha, result{idx}, gvl );", idx=idx)

        # ---- rewind ci (it was advanced by the C loads) and store
        if not is_trmm:
            dest.write()
            dest.write("ci=n_top*ldc+m_top;")
            dest.write()

        for col, reg in cells:
            idx = col*a_regs+reg
            dest.write("{VSEV}( &C[ci], c{idx}, gvl);{increment}", idx=idx, increment=ci_step(reg, idx))
|
||||
|
||||
|
||||
#-----------------------------------------------------------------------
|
||||
def generate_gemm_kernel_inner_complex( settings, dest, M, N, vlen, a_regs ):
    """Emit the vectorised body of one complex M x N GEMM/TRMM tile.

    settings -- mapping of option name to an object with a `.value` attribute
    dest     -- emitter providing map()/block()/write()
    M, N     -- tile dimensions substituted into the emitted index arithmetic
    vlen     -- accepted for signature parity; not read by this function
    a_regs   -- number of (real, imaginary) vector register pairs loaded from
                A per k step

    Real and imaginary parts are held in separate vector registers.  Each
    product is first formed in temporaries (tmp*) and only then added to the
    running accumulators (ACC*).  Because the number of free registers is
    limited, the N x a_regs grid of products is processed in tiles of
    tmp_unroll_j x tmp_unroll_i that share one set of temporaries.

    Raises RuntimeError when a wide accumulator is requested or when fewer
    than two vector registers would remain for temporaries.
    """
    TRMM = (settings['op'].value == 'trmm')
    narrow_result = (settings['param_precision'].value != 'double') and settings['force_acc_double'].value

    if narrow_result:
        # we could, but we run out of registers really really fast
        raise RuntimeError("wide accumulator not supported for generated complex kernels")

    def ci_step( i, idx ):
        # how to advance ci after touching result `idx`
        if idx == N*a_regs-1:
            return ''
        return 'ci += ldc-gvl*{};'.format(a_regs-1) if (i == a_regs-1) else ' ci += gvl;'

    with dest.map(
        M=M,
        N=N,
    ):
        dest.write("{index_type} ai=m_top*K*2;")
        dest.write("{index_type} bi=n_top*K*2;")
        if TRMM:
            generate_trmm_block( dest )

        # ---- peeled first k step: load operands
        for i in range(N):
            dest.write("{param_scalar_t} B{i}r = B[bi+{i}*2+0];", i=i)
            dest.write("{param_scalar_t} B{i}i = B[bi+{i}*2+1];", i=i)
        dest.write("bi += {N}*2;")
        dest.write()

        for i in range(a_regs):
            dest.write("{param_vector_t} A{i}r = {VLSEV}( &A[ai+{i}*gvl*2], sizeof(FLOAT)*2, gvl );", i=i)
            dest.write("{param_vector_t} A{i}i = {VLSEV}( &A[ai+{i}*gvl*2+1], sizeof(FLOAT)*2, gvl );", i=i)
        dest.write("ai += {M}*2;")
        dest.write()

        # for each vector register loaded from matrix A, we require N registers to hold vector-scalar multiply-accumulate results
        accumulation_regs = a_regs * N
        dest.write("// {a_regs} vector regs to hold A array contents, {accumulation_regs} regs to hold values accumulated over k",
            a_regs=a_regs*2, accumulation_regs=accumulation_regs*2
        )
        pass_regs = (accumulation_regs + a_regs)*2
        tmp_regs = (32 // settings['LMUL_ACC'].value) - pass_regs
        if tmp_regs < 2:
            raise RuntimeError("Complex kernel would use too many registers!")

        dest.write("// leaving {tmp_regs} vector registers for temporaries", tmp_regs=tmp_regs)

        tmp_unroll_i = min(tmp_regs, a_regs)
        tmp_unroll_j = N
        while tmp_unroll_j > 1 and (tmp_regs/(tmp_unroll_i*2)) < tmp_unroll_j:
            tmp_unroll_j = int(tmp_unroll_j / 2)

        if tmp_unroll_i < a_regs or tmp_unroll_j < N:
            dest.write("// performing {ops} operations between reuses of temporaries", ops=tmp_unroll_j*tmp_unroll_i)

        def tile_cells( tj, ti ):
            # (j, i, tmp) triples of one tile.  The upper bounds are clamped
            # to N / a_regs: when the tile size does not divide the grid the
            # last tile is partial, and stepping past the edge would emit
            # references to B/A/ACC registers that were never declared.
            return [ (j, i, (i-ti)+tmp_unroll_i*(j-tj))
                     for j in range(tj, min(tj+tmp_unroll_j, N))
                     for i in range(ti, min(ti+tmp_unroll_i, a_regs)) ]

        # The first tile is always full-sized, so it declares every
        # temporary that later (possibly partial) tiles reuse.
        tiles = [ tile_cells(tj, ti)
                  for tj in range(0, N, tmp_unroll_j)
                  for ti in range(0, a_regs, tmp_unroll_i) ]

        for tile_no, cells in enumerate(tiles):
            for j, i, tmp in cells:
                with dest.map(dest=j*a_regs+i, tmp=tmp, i=i, j=j):
                    if tile_no == 0:
                        dest.write("{acc_vector_t} tmp{tmp}r = {VMUL_TO_ACC}( A{i}i, B{j}i, gvl);")
                        dest.write("{acc_vector_t} tmp{tmp}i = {VMUL_TO_ACC}( A{i}r, B{j}i, gvl);")
                    else:
                        dest.write("tmp{tmp}r = {VMUL_TO_ACC}( A{i}i, B{j}i, gvl);")
                        dest.write("tmp{tmp}i = {VMUL_TO_ACC}( A{i}r, B{j}i, gvl);")
            for j, i, tmp in cells:
                with dest.map(dest=j*a_regs+i, tmp=tmp, i=i, j=j):
                    dest.write("tmp{tmp}r = VFMACC_RR( tmp{tmp}r, B{j}r, A{i}r, gvl);")
                    dest.write("tmp{tmp}i = VFMACC_RI( tmp{tmp}i, B{j}r, A{i}i, gvl);")

            for j, i, tmp in cells:
                with dest.map(dest=j*a_regs+i, tmp=tmp, i=i, j=j):
                    dest.write("{acc_vector_t} ACC{dest}r = tmp{tmp}r;")
                    dest.write("{acc_vector_t} ACC{dest}i = tmp{tmp}i;")

        # ---- remaining k steps
        with dest.block("for({index_type} k=1; k<{Kend}; k++) {{", "}}", Kend=('pass_K' if TRMM else 'K')):
            for i in range(N):
                dest.write("B{i}r = B[bi+{i}*2+0];", i=i)
                dest.write("B{i}i = B[bi+{i}*2+1];", i=i)
            dest.write("bi += {N}*2;")
            dest.write()

            for i in range(a_regs):
                dest.write("A{i}r = {VLSEV}( &A[ai+{i}*gvl*2], sizeof(FLOAT)*2, gvl );", i=i)
                dest.write("A{i}i = {VLSEV}( &A[ai+{i}*gvl*2+1], sizeof(FLOAT)*2, gvl );", i=i)

            dest.write("ai += {M}*2;")
            dest.write()

            for cells in tiles:
                # note the values in tmp{tmp}* are frequently of similar magnitude and opposite sign
                # so accumulating them directly to ACC would lose precision when ACC is larger

                for j, i, tmp in cells:
                    with dest.map(dest=j*a_regs+i, tmp=tmp, i=i, j=j):
                        dest.write("tmp{tmp}r = {VMUL_TO_ACC}( A{i}i, B{j}i, gvl);")
                        dest.write("tmp{tmp}i = {VMUL_TO_ACC}( A{i}r, B{j}i, gvl);")
                for j, i, tmp in cells:
                    with dest.map(dest=j*a_regs+i, tmp=tmp, i=i, j=j):
                        dest.write("tmp{tmp}r = VFMACC_RR( tmp{tmp}r, B{j}r, A{i}r, gvl);")
                        dest.write("tmp{tmp}i = VFMACC_RI( tmp{tmp}i, B{j}r, A{i}i, gvl);")
                for j, i, tmp in cells:
                    with dest.map(dest=j*a_regs+i, tmp=tmp, i=i, j=j):
                        dest.write("ACC{dest}r = {__riscv_}vfadd( ACC{dest}r, tmp{tmp}r, gvl);")
                        dest.write("ACC{dest}i = {__riscv_}vfadd( ACC{dest}i, tmp{tmp}i, gvl);")

        dest.write()
        dest.write("{index_type} ci=n_top*ldc+m_top;")
        dest.write()

        # ---- TRMM starts C from alphar*ACC; GEMM loads the existing C
        for j in range(N):
            if TRMM:
                for i in range(a_regs):
                    with dest.map(idx=j*a_regs+i):
                        dest.write("{param_vector_t} C{idx}r = {__riscv_}vfmul( ACC{idx}r, alphar, gvl );")
                        dest.write("{param_vector_t} C{idx}i = {__riscv_}vfmul( ACC{idx}i, alphar, gvl );")
            else:
                for i in range(a_regs):
                    idx = j*a_regs+i
                    with dest.map(idx=idx, increment=ci_step(i, idx)):
                        dest.write("{param_vector_t} C{idx}r = {VLSEV}( &C[ci*2+0], sizeof(FLOAT)*2, gvl );")
                        dest.write("{param_vector_t} C{idx}i = {VLSEV}( &C[ci*2+1], sizeof(FLOAT)*2, gvl );")
                        dest.write("{increment}")

        # ---- complex scaling by (alphar + i*alphai)
        if not TRMM:
            for j in range(N):
                for i in range(a_regs):
                    with dest.map(idx=j*a_regs+i):
                        dest.write("C{idx}r = {__riscv_}vfmacc( C{idx}r, alphar, ACC{idx}r, gvl );")
                        dest.write("C{idx}i = {__riscv_}vfmacc( C{idx}i, alphar, ACC{idx}i, gvl );")

        for j in range(N):
            for i in range(a_regs):
                with dest.map(idx=j*a_regs+i):
                    dest.write("C{idx}r = {__riscv_}vfnmsac( C{idx}r, alphai, ACC{idx}i, gvl );")
                    dest.write("C{idx}i = {__riscv_}vfmacc ( C{idx}i, alphai, ACC{idx}r, gvl );")

        # ---- rewind ci (it was advanced by the C loads) and store
        if not TRMM:
            dest.write()
            dest.write("ci=n_top*ldc+m_top;")
            dest.write()

        for j in range(N):
            for i in range(a_regs):
                idx = j*a_regs+i
                with dest.map(idx=idx, increment=ci_step(i, idx)):
                    dest.write("{VSSEV}( &C[ci*2+0], sizeof(FLOAT)*2, C{idx}r, gvl);")
                    dest.write("{VSSEV}( &C[ci*2+1], sizeof(FLOAT)*2, C{idx}i, gvl);")
                    dest.write("{increment}")
|
||||
|
||||
#-----------------------------------------------------------------------
|
||||
def generate_gemm_kernel( settings, OUTPUT ):
    """Emit a complete GEMM/TRMM kernel function through OUTPUT.

    settings -- mapping of option name to an object with a `.value`
                attribute; every value is also exposed (as a string) to the
                format strings
    OUTPUT   -- callable receiving the generated source one line at a time

    The kernel walks C in N-wide column panels and M-tall row blocks, then
    handles leftover columns with successively halved panel widths.  Leftover
    rows are delegated to generate_M_tails().

    Exits via ERROR() for conjugate kernels and raises Exception when the
    requested unrolling needs more vector registers than are available.
    """
    def opt( key ):
        return settings[key].value

    if opt('conjugate'):
        ERROR('conjugate gemm not yet supported')

    is_complex = opt('complex')
    if is_complex:
        generate_gemm_kernel_inner = generate_gemm_kernel_inner_complex
    else:
        generate_gemm_kernel_inner = generate_gemm_kernel_inner_real
    dest = Target(OUTPUT, { k:str(settings[k].value) for k in settings })

    M = opt('M')
    N = opt('N')
    vlenmax = int(opt('reg_width_bits') * opt('LMUL_ACC') /
                  opt('ELEN_PARAM'))
    a_regs = max(int(M/vlenmax), 1)

    # for each vector register loaded from matrix A, we require N registers to hold vector-scalar multiply-accumulate results
    accumulation_regs = a_regs * N
    required_regs = accumulation_regs + a_regs
    if is_complex:
        required_regs = required_regs * 2 + 2
        # sign conventions and fused-multiply variants for each
        # conjugation/transposition combination
        sign_macros = '''
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define S0 1
#define S1 -1
#define S2 1
#define S3 1
#define VFMACC_RR __riscv_vfmsac{tail_policy}
#define VFMACC_RI __riscv_vfmacc{tail_policy}
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define S0 1
#define S1 1
#define S2 1
#define S3 -1
#define VFMACC_RR __riscv_vfmacc{tail_policy}
#define VFMACC_RI __riscv_vfmsac{tail_policy}
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define S0 1
#define S1 1
#define S2 -1
#define S3 1
#define VFMACC_RR __riscv_vfmacc{tail_policy}
#define VFMACC_RI __riscv_vfnmsac{tail_policy}
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
#define S0 1
#define S1 -1
#define S2 -1
#define S3 -1
#define VFMACC_RR __riscv_vfmsac{tail_policy}
#define VFMACC_RI __riscv_vfnmacc{tail_policy}
#endif
'''.format(tail_policy=opt('tail_policy'))
        dest.write(sign_macros)

    if required_regs > (32 // opt('LMUL_ACC')):
        raise Exception("{} vector registers needed during accumulation for unrolling {} x {}{} but only {} are available".format(
            required_regs, N, M, (" with wide accumulator" if opt('LMUL_ACC') > 1 else ''), 32 // opt('LMUL_ACC')
        ))

    TRMM = (opt('op') == 'trmm')
    if TRMM:
        with dest.block("#if defined(LEFT) != defined(TRANSA)", "#endif"):
            dest.write("#define BACKWARDS")

    alpha_params = 'FLOAT alphar, FLOAT alphai' if is_complex else 'FLOAT alpha'
    trmm_params = ', BLASLONG offset' if TRMM else ''
    dest.write("int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, {alpha}, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc{trmm})",
        alpha = alpha_params,
        trmm = trmm_params
    )

    # vector length requested for full-height row blocks
    full_vl = min(vlenmax,max(int(M/a_regs),1))

    with dest.block("{{", "}}", elt_size='*2' if is_complex else ''):
        if opt('trace'):
            dest.write("printf(\"\\n\\nENTRY: %s(%d) M %d N %d K %d ldc %d\\n\", __FILE__, __LINE__, M, N, K, ldc);")
        dest.write("{index_type} gvl = 0;")
        dest.write("{index_type} m_top = 0;")
        dest.write("{index_type} n_top = 0;")

        dest.write()
        dest.write()
        dest.write("// -- MAIN PASS")

        with dest.block("for ({index_type} j=0; j<N/{N}; j+=1) {{", "}}"):
            dest.write("m_top = 0;")
            dest.write("{index_type} gvl = {VSETVL}({vlenmax});", vlenmax=full_vl)
            dest.write()
            with dest.block("for ({index_type} i=0; i<M/{M}; i+=1) {{", "}}"):
                generate_gemm_kernel_inner( settings, dest, M, N, vlenmax, a_regs )
                dest.write( "m_top += {M};" )

            dest.write()
            dest.write()
            dest.write("// -- tails for main pass")
            generate_M_tails( dest, settings, M, N )

            dest.write( "n_top += {N};" )

        # leftover columns: panel widths N/2, N/4, ... selected by the bits of N
        N_tail = int(N/2)
        while N_tail > 0:
            with dest.map(N=N_tail):
                dest.write()
                dest.write()
                dest.write("// -- tails for N={N}")
                with dest.block("if( N & {N} ) {{", "}}" ):
                    if opt('trace'):
                        dest.write("printf(\"N tail entry: %s(%d) M %d N %d K %d m_top %d n_top %d\\n\", __FILE__, __LINE__, M, N, K, m_top, n_top);")
                    dest.write("gvl = {VSETVL}({vlenmax});", vlenmax=full_vl)
                    dest.write("m_top = 0;")
                    with dest.block("for ({index_type} i=0; i<M/{M}; i+=1) {{", "}}"):
                        generate_gemm_kernel_inner( settings, dest, M, N_tail, vlenmax, a_regs )
                        dest.write("m_top += {M};")

                    generate_M_tails( dest, settings, M, N_tail )
                    dest.write("n_top += {N};")
            N_tail = int(N_tail/2)

        dest.write("return 0;")
|
||||
|
||||
|
||||
#-----------------------------------------------------------------------
|
||||
def generate_M_tails( dest, settings, M, N ):
    """Emit the C code for the rows left over once the M-wide passes are done.

    Tail sizes are M/2, M/4, ... down to 1; each one is guarded in the
    generated C by ``if( M & tail )``.  Tails larger than the
    ``M_tail_scalar_from`` setting reuse the vector inner kernel with a
    shorter vector length, the remaining ones are written out as fully
    unrolled scalar code.

    dest     -- code writer; only its write(), block() and map() are used here
    settings -- dict of Setting objects driving the generation
    M, N     -- tile dimensions of the pass whose tails are being generated
    """
    scalar_from = settings['M_tail_scalar_from'].value
    max_lanes = int(settings['reg_width_bits'].value * settings['LMUL_ACC'].value
                    / settings['ELEN_PARAM'].value )
    trmm = (settings['op'].value == 'trmm')
    is_complex = settings['complex'].value
    if is_complex:
        emit_inner = generate_gemm_kernel_inner_complex
    else:
        emit_inner = generate_gemm_kernel_inner_real

    # Same trace statement is emitted by both the vector and the scalar tails.
    trace_stmt = "printf(\"tail: %s(%d) M %d N %d K %d m_top %d n_top %d\\n\", __FILE__, __LINE__, M, N, K, m_top, n_top);"

    tail = int(M/2)

    # ---- vector tails: same inner kernel, fewer lanes / registers ----
    while tail > scalar_from:
        with dest.block("if( M & {M_tail} ) {{", "}}", M_tail=tail ):
            if settings['trace'].value:
                dest.write(trace_stmt)
            regs = max( 1, int(tail/max_lanes) )
            lanes = int(tail/regs)
            dest.write("gvl = {VSETVL}({vlen});\n", vlen=lanes)

            emit_inner( settings, dest, tail, N, lanes, regs )
            dest.write( "m_top += {M_tail};" )

        tail = int( tail / 2 )

    # ---- scalar tails: one named accumulator per output element ----
    while tail > 0:
        with dest.block("if( M & {M_tail} ) {{", "}}",
                        M_tail=tail,
                        N=N,
                        result_t = ('double' if settings['force_acc_double'].value else settings['param_scalar_t'].value)
                        ):
            if settings['trace'].value:
                dest.write(trace_stmt)

            # Complex results need a real and an imaginary accumulator each.
            parts = 2 if is_complex else 1
            for r in range(tail * N * parts):
                dest.write("{result_t} result{r} = 0;",
                    r=r
                )

            dest.write("{index_type} ai=m_top*K{elt_size};")
            dest.write("{index_type} bi=n_top*K{elt_size};")

            if trmm:
                with dest.map(M=tail, N=N):
                    generate_trmm_block( dest )

            # Accumulation loop over k, fully unrolled over the tail tile.
            with dest.block("for({index_type} k=0; k<{Kend}; k++) {{", "}}", Kend = ('pass_K' if trmm else 'K') ):
                for col in range( N ):
                    for row in range( tail ):
                        acc = col*tail+row
                        if not is_complex:
                            dest.write("result{dest}+=A[ai+{kj}]*B[bi+{ki}];".format(
                                dest=acc, kj=row, ki=col
                            ))
                            continue
                        dest.write("result{dest}+=S0*A[ai+{kj}+0]*B[bi+{ki}+0] + S1*A[ai+{kj}+1]*B[bi+{ki}+1];".format(
                            dest=acc*2, kj=row*2, ki=col*2
                        ))
                        dest.write("result{dest}+=S2*A[ai+{kj}+1]*B[bi+{ki}+0] + S3*A[ai+{kj}+0]*B[bi+{ki}+1];".format(
                            dest=acc*2+1, kj=row*2, ki=col*2
                        ))
                dest.write("ai+={M_tail}{elt_size};")
                dest.write("bi+={N}{elt_size};")

            # Write-back: scale by alpha and store (trmm) or accumulate (gemm) into C.
            dest.write("{index_type} ci=n_top*ldc+m_top;")
            if is_complex:
                dest.write("{result_t} Cr, Ci;")
            for col in range( N ):
                for row in range( tail ):
                    if is_complex:
                        re_acc = (col*tail+row)*2+0
                        im_acc = (col*tail+row)*2+1
                        if trmm:
                            dest.write('Cr = result{dest}*alphar;', dest=re_acc)
                            dest.write('Ci = result{dest}*alphar;', dest=im_acc)
                        else:
                            dest.write('Cr = C[(ci+{ki}*ldc+{kj})*2+0];', ki=col, kj=row)
                            dest.write('Ci = C[(ci+{ki}*ldc+{kj})*2+1];', ki=col, kj=row)
                            dest.write('Cr += result{dest}*alphar;', dest=re_acc)
                            dest.write('Ci += result{dest}*alphar;', dest=im_acc)
                        dest.write('Cr -= result{dest}*alphai;', dest=im_acc)
                        dest.write('Ci += result{dest}*alphai;', dest=re_acc)
                        dest.write("C[(ci+{ki}*ldc+{kj})*2+0] = Cr;", ki=col, kj=row )
                        dest.write("C[(ci+{ki}*ldc+{kj})*2+1] = Ci;", ki=col, kj=row )
                    else:
                        op = '' if trmm else '+'
                        dest.write("C[ci+{ki}*ldc+{kj}] {op}= alpha * result{dest};",
                            ki=col, kj=row, op=op, dest=col*tail+row
                        )
            dest.write("m_top+={M_tail};")

        tail = int(tail/2)
|
||||
|
||||
|
||||
#-----------------------------------------------------------------------
|
||||
class Setting(object):
    """One generator option: its current value plus an optional string parser.

    A setting created with a ``convert`` callable is "configurable": assigning
    to ``value`` runs the assigned object through that callable first.
    Settings created without one are derived values and report
    ``configurable == False``.
    """

    def __init__( self, value, convert = None ):
        self._convert = convert
        self._value = value

    @classmethod
    def ENUM( cls, *values ):
        """Return a converter mapping a string, case-insensitively, to one of `values`.

        The converter raises KeyError for a string that matches none of them.
        """
        canonical = {}
        for candidate in values:
            canonical[candidate.lower()] = candidate

        def lookup( value ):
            return canonical[value.lower()]

        return lookup

    @classmethod
    def BOOL( cls, value ):
        """Converter: true for '1' or any string starting with 't'/'T'."""
        if value.lower().startswith('t'):
            return True
        return value == '1'

    @property
    def value( self ):
        return self._value

    @value.setter
    def value( self, value ):
        # The raw (command-line) text is always passed through the converter.
        self._value = self._convert( value )

    @property
    def configurable( self ):
        return self._convert is not None

    def __str__( self ):
        return str(self._value)
|
||||
|
||||
#-----------------------------------------------------------------------
|
||||
def main():
    """Command-line entry point: parse name=value options and write one kernel file.

    Every argument must have the form ``name=value`` where ``name`` is a key of
    the ``settings`` table below.  After parsing, the derived settings
    (element widths, intrinsic names, vector types) are filled in and the
    generated C source is written to ``output`` ('-' means standard output;
    when ``output`` is not given a file name is built from the other options).

    Problems are reported through ERROR(), which is defined elsewhere in this
    file -- presumably it prints the message and terminates; confirm there.
    """
    settings = {
        'op': Setting( 'gemm', Setting.ENUM( 'gemm', 'trmm' ) ),
        'M': Setting( 16, int ),
        'N': Setting( 4, int ),
        'reg_width_bits': Setting( 256, int ),
        'LMUL': Setting( 1, int ),
        'M_tail_scalar_from':Setting( 2, int ),
        'cpu': Setting( 'zvl256b', str ),
        'param_precision': Setting( 'float', Setting.ENUM( 'float', 'double' ) ),
        'force_acc_double': Setting( False, Setting.BOOL ),
        'complex': Setting( False, Setting.BOOL ),
        'conjugate': Setting( False, Setting.BOOL ),
        'index_type': Setting( 'BLASLONG', str ),
        'trace': Setting( False, Setting.BOOL ),
        'output': Setting( None, str ),
        'tail_policy': Setting( '', str ), # _ta, if toolchain supports it
        '__riscv_': Setting( '__riscv_', str),
    }

    for item in sys.argv[1:]:
        try:
            name, value = tuple(item.split( '=', 1 ))
        except ValueError:
            # Only the unpacking can fail here (no '=' in the argument); a bare
            # except would also have swallowed KeyboardInterrupt/SystemExit.
            ERROR("couldn't parse {}, expected arguments of the form name=value".format(item))

        if name not in settings:
            ERROR("couldn't parse {}, {} it is not a known option\n".format( item, name )
                +"options (and current defaults) are\n{}".format(
                    " ".join([ '{}={}'.format(k, settings[k].value) for k in settings.keys()]))
            )

        try:
            settings[name].value = value
        except Exception:
            # Converters raise different types (ValueError from int, KeyError
            # from ENUM, ...); interpreter-exit exceptions must still propagate.
            import traceback
            traceback.print_exc()
            ERROR("couldn't parse {}".format(item))

    # Default output name, e.g. <prefix><op>_kernel_<M>x<N>_<cpu>.c
    if settings['output'].value is None:
        if settings['complex'].value:
            prefix = 'z' if settings['param_precision'].value == 'double' else 'c'
        else:
            prefix = 'd' if settings['param_precision'].value == 'double' else 's'
        settings['output'] = Setting('{}{}_kernel_{}x{}_{}.c'.format(
            prefix,
            settings['op'],
            settings['M'],
            settings['N'],
            settings['cpu']
        ))

    # ---- derived settings (created without a converter, so not configurable) ----
    if settings['param_precision'].value == 'double':
        settings['param_scalar_t'] = Setting( 'double' )
        settings['ELEN_PARAM'] = Setting(64)
    else:
        settings['param_scalar_t'] = Setting( 'float' )
        settings['ELEN_PARAM'] = Setting(32)

    settings['VFMUL'] = Setting( '{}vfmul_vf_f{}m{}{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL'], settings['tail_policy']) )
    settings['VFMACC'] = Setting( '{}vfmacc_vf_f{}m{}{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL'], settings['tail_policy']) )

    # By default the accumulators have the same element width and LMUL as the inputs.
    settings['ELEN_ACC'] = settings['ELEN_PARAM']
    settings['LMUL_ACC'] = Setting(settings['LMUL'].value)
    widen = ''

    # float inputs with double accumulation: widening multiply/accumulate ops,
    # accumulator registers twice as wide, and a narrowing convert intrinsic.
    if settings['force_acc_double'].value and (settings['param_precision'].value == 'float'):
        settings['ELEN_ACC'] = Setting(64)
        settings['LMUL_ACC'] = Setting(settings['LMUL'].value*2)
        settings['VFNCVT'] = Setting('{}vfncvt_f_f_w_f{}m{}{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL'], settings['tail_policy']))
        widen = 'w'

    settings['VMUL_TO_ACC'] = Setting( '{}vf{}mul_vf_f{}m{}{}'.format(settings['__riscv_'], widen, settings['ELEN_ACC'], settings['LMUL_ACC'], settings['tail_policy']) )
    settings['VMACC_TO_ACC'] = Setting( '{}vf{}macc_vf_f{}m{}{}'.format(settings['__riscv_'], widen, settings['ELEN_ACC'], settings['LMUL_ACC'], settings['tail_policy']) )

    settings['param_vector_t']=Setting('vfloat{}m{}_t'.format(settings['ELEN_PARAM'], settings['LMUL']))
    settings['acc_vector_t'] =Setting('vfloat{}m{}_t'.format(settings['ELEN_ACC'], settings['LMUL_ACC']))
    settings['VLEV'] =Setting('{}vle{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL']))
    settings['VSEV'] =Setting('{}vse{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL']))
    settings['VLSEV'] =Setting('{}vlse{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL']))
    settings['VSSEV'] =Setting('{}vsse{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL']))
    settings['VSETVL'] =Setting('{}vsetvl_e{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL']))

    to_stdout = (settings['output'].value == '-')
    if not to_stdout:
        print("Writing {}".format(settings['output'].value), file=sys.stderr)

    # When writing to stdout we wrap the existing descriptor; closefd=False keeps
    # the with-block from closing the process's stdout.  (closefd must stay True
    # when a file name is given.)
    with open(sys.stdout.fileno() if to_stdout else settings['output'].value, 'w',
              closefd=not to_stdout) as destination_file:
        def OUTPUT(*args, **kwargs):
            print(*args, file=destination_file, **kwargs)

        # Header comment recording the exact settings the kernel was generated with.
        OUTPUT("/*\n\nAUTOGENERATED KERNEL\nSettings:\n {}".format(" ".join([ "{}={}\n".format(k, repr(settings[k].value)) for k in sorted(settings.keys()) if settings[k].configurable])))
        OUTPUT("Derived:\n {}\n*/\n".format(" ".join([ "{}={}\n".format(k, repr(settings[k].value)) for k in sorted(settings.keys()) if not settings[k].configurable])))

        OUTPUT('#include "common.h"')
        OUTPUT("\n")

        if settings['op'].value in ('gemm', 'trmm'):
            generate_gemm_kernel(settings, OUTPUT)
        else:
            ERROR("unsupported kernel type {}".format(settings['op']))


if __name__ == "__main__":
    main()
|
||||
149
kernel/riscv64/iamax_rvv.c
Normal file
149
kernel/riscv64/iamax_rvv.c
Normal file
@@ -0,0 +1,149 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
// Precision-dependent aliases for the RVV intrinsics used by this kernel.
// Both variants work on LMUL=8 register groups; only the element width
// (and therefore the mask type) differs.  Macros whose name ends in _TU are
// bound to the *_tu / *_tumu intrinsic variants.
#if defined(DOUBLE)

// vector length control
#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
#define VSETVL_MAX              __riscv_vsetvlmax_e64m8()

// vector and mask types
#define FLOAT_V_T               vfloat64m8_t
#define FLOAT_V_T_M1            vfloat64m1_t
#define UINT_V_T                vuint64m8_t
#define MASK_T                  vbool8_t

// loads (unit stride / strided)
#define VLEV_FLOAT              __riscv_vle64_v_f64m8
#define VLSEV_FLOAT             __riscv_vlse64_v_f64m8

// floating-point arithmetic, splats and reduction
#define VFABSV_FLOAT            __riscv_vfabs_v_f64m8
#define VFMAXVV_FLOAT_TU        __riscv_vfmax_vv_f64m8_tu
#define VFREDMAXVS_FLOAT        __riscv_vfredmax_vs_f64m8_f64m1
#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64

// comparisons producing masks, and mask queries
#define VMFLTVF_FLOAT           __riscv_vmflt_vf_f64m8_b8
#define VMFLTVV_FLOAT           __riscv_vmflt_vv_f64m8_b8
#define VMFGEVF_FLOAT           __riscv_vmfge_vf_f64m8_b8
#define VFIRSTM                 __riscv_vfirst_m_b8

// index-vector bookkeeping
#define VIDV_UINT               __riscv_vid_v_u64m8
#define VIDV_MASK_UINT_TU       __riscv_vid_v_u64m8_tumu
#define VADDVX_UINT             __riscv_vadd_vx_u64m8
#define VADDVX_MASK_UINT_TU     __riscv_vadd_vx_u64m8_tumu
#define VMVVX_UINT              __riscv_vmv_v_x_u64m8
#define VSLIDEDOWN_UINT         __riscv_vslidedown_vx_u64m8
#define VMVVXS_UINT             __riscv_vmv_x_s_u64m8_u64

#else

// vector length control
#define VSETVL(n)               __riscv_vsetvl_e32m8(n)
#define VSETVL_MAX              __riscv_vsetvlmax_e32m8()

// vector and mask types
#define FLOAT_V_T               vfloat32m8_t
#define FLOAT_V_T_M1            vfloat32m1_t
#define UINT_V_T                vuint32m8_t
#define MASK_T                  vbool4_t

// loads (unit stride / strided)
#define VLEV_FLOAT              __riscv_vle32_v_f32m8
#define VLSEV_FLOAT             __riscv_vlse32_v_f32m8

// floating-point arithmetic, splats and reduction
#define VFABSV_FLOAT            __riscv_vfabs_v_f32m8
#define VFMAXVV_FLOAT_TU        __riscv_vfmax_vv_f32m8_tu
#define VFREDMAXVS_FLOAT        __riscv_vfredmax_vs_f32m8_f32m1
#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32

// comparisons producing masks, and mask queries
#define VMFLTVF_FLOAT           __riscv_vmflt_vf_f32m8_b4
#define VMFLTVV_FLOAT           __riscv_vmflt_vv_f32m8_b4
#define VMFGEVF_FLOAT           __riscv_vmfge_vf_f32m8_b4
#define VFIRSTM                 __riscv_vfirst_m_b4

// index-vector bookkeeping
#define VIDV_UINT               __riscv_vid_v_u32m8
#define VIDV_MASK_UINT_TU       __riscv_vid_v_u32m8_tumu
#define VADDVX_UINT             __riscv_vadd_vx_u32m8
#define VADDVX_MASK_UINT_TU     __riscv_vadd_vx_u32m8_tumu
#define VMVVX_UINT              __riscv_vmv_v_x_u32m8
#define VSLIDEDOWN_UINT         __riscv_vslidedown_vx_u32m8
#define VMVVXS_UINT             __riscv_vmv_x_s_u32m8_u32

#endif
|
||||
|
||||
// Return the 1-based position of an element of x with the largest absolute
// value, or 0 when n <= 0 or inc_x <= 0.
//
//   n      number of elements to examine
//   x      first element
//   inc_x  distance, in elements, between consecutive entries
//
// Every vector lane keeps the largest |x| it has seen so far (v_max) and the
// element position at which that value was first reached (v_max_index); the
// lanes are combined into one answer after the loop.
//
// NOTE(review): ties between lanes are resolved by the lowest lane number,
// which is not necessarily the lowest element position when the maximum
// occurs more than once -- confirm against the expected first-occurrence rule.
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    // BLASLONG rather than unsigned int: the value extracted from the index
    // vector is 64 bits wide in the DOUBLE build and the lane number returned
    // by VFIRSTM is signed, so a 32-bit unsigned holder could truncate both.
    // (In the single-precision build the per-lane indices themselves are
    // still 32-bit, see UINT_V_T.)
    BLASLONG max_index = 0;
    if (n <= 0 || inc_x <= 0) return(max_index);

    FLOAT_V_T vx, v_max;
    UINT_V_T v_max_index;
    MASK_T mask;

    size_t vlmax = VSETVL_MAX;
    v_max_index = VMVVX_UINT(0, vlmax);
    // |x| is never negative, so -1 loses against the first value a lane sees.
    v_max = VFMVVF_FLOAT(-1, vlmax);
    BLASLONG j=0;   // position of lane 0 of the current strip within x
    FLOAT maxf=0.0;

    if(inc_x == 1) {

        for (size_t vl; n > 0; n -= vl, x += vl, j += vl) {
            vl = VSETVL(n);

            vx = VLEV_FLOAT(x, vl);
            vx = VFABSV_FLOAT(vx, vl);

            // lanes whose new value is strictly greater than their running maximum
            mask = VMFLTVV_FLOAT(v_max, vx, vl);
            // for those lanes only: index = lane id + j; other lanes keep theirs
            v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl);
            v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl);

            // update the running maxima; lanes past vl are left untouched
            v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx, vl);
        }

    } else {

        BLASLONG stride_x = inc_x * sizeof(FLOAT);   // byte stride for the strided load

        for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) {
            vl = VSETVL(n);

            vx = VLSEV_FLOAT(x, stride_x, vl);
            vx = VFABSV_FLOAT(vx, vl);

            // lanes whose new value is strictly greater than their running maximum
            mask = VMFLTVV_FLOAT(v_max, vx, vl);
            // for those lanes only: index = lane id + j; other lanes keep theirs
            v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl);
            v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl);

            // update the running maxima; lanes past vl are left untouched
            v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx, vl);
        }

    }

    // Reduce the per-lane maxima to a single value ...
    FLOAT_V_T_M1 v_res;
    v_res = VFMVVF_FLOAT_M1(0, vlmax);

    v_res = VFREDMAXVS_FLOAT(v_max, v_res, vlmax);
    maxf = VFMVFS_FLOAT_M1(v_res);
    // ... find the first lane holding it ...
    mask = VMFGEVF_FLOAT(v_max, maxf, vlmax);
    max_index = VFIRSTM(mask, vlmax);

    // ... and move that lane's recorded position down to element 0 to read it.
    v_max_index = VSLIDEDOWN_UINT(v_max_index, max_index, vlmax);
    max_index = VMVVXS_UINT(v_max_index);

    return(max_index+1);
}
|
||||
@@ -27,127 +27,123 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
#include <float.h>
|
||||
|
||||
#if defined(DOUBLE)
|
||||
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
|
||||
#define VSETVL(n) vsetvl_e64m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e64m1()
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
|
||||
#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
|
||||
#define FLOAT_V_T vfloat64m4_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VLEV_FLOAT vle64_v_f64m8
|
||||
#define VLSEV_FLOAT vlse64_v_f64m8
|
||||
#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
|
||||
#define MASK_T vbool8_t
|
||||
#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
|
||||
#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f64m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
|
||||
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
|
||||
#define VFMAXVV_FLOAT vfmax_vv_f64m8
|
||||
#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8
|
||||
#define VMFIRSTM vmfirst_m_b8
|
||||
#define UINT_V_T vuint64m8_t
|
||||
#define VIDV_MASK_UINT vid_v_u64m8_m
|
||||
#define VIDV_UINT vid_v_u64m8
|
||||
#define VADDVX_MASK_UINT vadd_vx_u64m8_m
|
||||
#define VADDVX_UINT vadd_vx_u64m8
|
||||
#define VMVVX_UINT vmv_v_x_u64m8
|
||||
#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4)
|
||||
#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4)
|
||||
#ifdef RISCV_0p10_INTRINSICS
|
||||
#define VFREDMAXVS_FLOAT(va, vb, gvl) vfredmax_vs_f64m4_f64m1(v_res, va, vb, gvl)
|
||||
#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m4_m)
|
||||
#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u64m4)(vm, compressed, va, gvl)
|
||||
#else
|
||||
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
|
||||
#define VSETVL(n) vsetvl_e32m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m1()
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define VFREDMAXVS_FLOAT RISCV_RVV(vfredmax_vs_f64m4_f64m1)
|
||||
#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m4_mu)
|
||||
#define VCOMPRESS RISCV_RVV(vcompress_vm_u64m4)
|
||||
#endif
|
||||
#define MASK_T vbool16_t
|
||||
#define VMFLTVV_FLOAT RISCV_RVV(vmflt_vv_f64m4_b16)
|
||||
#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4)
|
||||
#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1)
|
||||
#define VFMAXVV_FLOAT RISCV_RVV(vfmax_vv_f64m4)
|
||||
#define VMFGEVF_FLOAT RISCV_RVV(vmfge_vf_f64m4_b16)
|
||||
#define VMFIRSTM RISCV_RVV(vfirst_m_b16)
|
||||
#define UINT_V_T vuint64m4_t
|
||||
#define VIDV_UINT RISCV_RVV(vid_v_u64m4)
|
||||
#define VADDVX_UINT RISCV_RVV(vadd_vx_u64m4)
|
||||
#define VMVVX_UINT RISCV_RVV(vmv_v_x_u64m4)
|
||||
#define VFABS_FLOAT RISCV_RVV(vfabs_v_f64m4)
|
||||
#define VMV_X RISCV_RVV(vmv_x_s_u64m4_u64)
|
||||
#else
|
||||
|
||||
#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n)
|
||||
#define FLOAT_V_T vfloat32m4_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VLEV_FLOAT vle32_v_f32m8
|
||||
#define VLSEV_FLOAT vlse32_v_f32m8
|
||||
#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
|
||||
#define MASK_T vbool4_t
|
||||
#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
|
||||
#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f32m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
|
||||
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
|
||||
#define VFMAXVV_FLOAT vfmax_vv_f32m8
|
||||
#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4
|
||||
#define VMFIRSTM vmfirst_m_b4
|
||||
#define UINT_V_T vuint32m8_t
|
||||
#define VIDV_MASK_UINT vid_v_u32m8_m
|
||||
#define VIDV_UINT vid_v_u32m8
|
||||
#define VADDVX_MASK_UINT vadd_vx_u32m8_m
|
||||
#define VADDVX_UINT vadd_vx_u32m8
|
||||
#define VMVVX_UINT vmv_v_x_u32m8
|
||||
#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4)
|
||||
#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4)
|
||||
#ifdef RISCV_0p10_INTRINSICS
|
||||
#define VFREDMAXVS_FLOAT(va, vb, gvl) vfredmax_vs_f32m4_f32m1(v_res, va, vb, gvl)
|
||||
#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m4_m)
|
||||
#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u32m4)(vm, compressed, va, gvl)
|
||||
#else
|
||||
#define VFREDMAXVS_FLOAT RISCV_RVV(vfredmax_vs_f32m4_f32m1)
|
||||
#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m4_mu)
|
||||
#define VCOMPRESS RISCV_RVV(vcompress_vm_u32m4)
|
||||
#endif
|
||||
#define MASK_T vbool8_t
|
||||
#define VMFLTVV_FLOAT RISCV_RVV(vmflt_vv_f32m4_b8)
|
||||
#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4)
|
||||
#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1)
|
||||
#define VFMAXVV_FLOAT RISCV_RVV(vfmax_vv_f32m4)
|
||||
#define VMFGEVF_FLOAT RISCV_RVV(vmfge_vf_f32m4_b8)
|
||||
#define VMFIRSTM RISCV_RVV(vfirst_m_b8)
|
||||
#define UINT_V_T vuint32m4_t
|
||||
#define VIDV_UINT RISCV_RVV(vid_v_u32m4)
|
||||
#define VADDVX_UINT RISCV_RVV(vadd_vx_u32m4)
|
||||
#define VMVVX_UINT RISCV_RVV(vmv_v_x_u32m4)
|
||||
#define VFABS_FLOAT RISCV_RVV(vfabs_v_f32m4)
|
||||
#define VMV_X RISCV_RVV(vmv_x_s_u32m4_u32)
|
||||
#endif
|
||||
|
||||
|
||||
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
BLASLONG i=0, j=0;
|
||||
FLOAT maxf=0.0;
|
||||
#ifdef DOUBLE
|
||||
BLASLONG max_index = 0;
|
||||
#else
|
||||
BLASLONG i=0, j=0;
|
||||
unsigned int max_index = 0;
|
||||
#endif
|
||||
if (n <= 0 || inc_x <= 0) return(max_index);
|
||||
if (n <= 0 || inc_x <= 0) return(max_index);
|
||||
FLOAT maxf=-FLT_MAX;
|
||||
|
||||
FLOAT_V_T vx, v_max;
|
||||
UINT_V_T v_max_index;
|
||||
MASK_T mask;
|
||||
unsigned int gvl = 0;
|
||||
FLOAT_V_T_M1 v_res, v_z0;
|
||||
gvl = VSETVL_MAX;
|
||||
v_res = VFMVVF_FLOAT_M1(0, gvl);
|
||||
v_z0 = VFMVVF_FLOAT_M1(0, gvl);
|
||||
FLOAT_V_T_M1 v_res;
|
||||
v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1);
|
||||
|
||||
gvl = VSETVL(n);
|
||||
UINT_V_T vid = VIDV_UINT(gvl);
|
||||
|
||||
if(inc_x == 1){
|
||||
gvl = VSETVL(n);
|
||||
v_max_index = VMVVX_UINT(0, gvl);
|
||||
v_max = VFMVVF_FLOAT(-1, gvl);
|
||||
v_max = VFMVVF_FLOAT(-FLT_MAX, gvl);
|
||||
for(i=0,j=0; i < n/gvl; i++){
|
||||
vx = VLEV_FLOAT(&x[j], gvl);
|
||||
//fabs(vector)
|
||||
mask = VMFLTVF_FLOAT(vx, 0, gvl);
|
||||
vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);
|
||||
vx = VFABS_FLOAT(vx, gvl);
|
||||
|
||||
//index where element greater than v_max
|
||||
mask = VMFLTVV_FLOAT(v_max, vx, gvl);
|
||||
v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl);
|
||||
v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl);
|
||||
v_max_index = VADDVX_MASK_UINT(mask, v_max_index, vid, j, gvl);
|
||||
|
||||
//update v_max and start_index j
|
||||
v_max = VFMAXVV_FLOAT(v_max, vx, gvl);
|
||||
j += gvl;
|
||||
}
|
||||
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl);
|
||||
maxf = VFMVFS_FLOAT(v_res);
|
||||
v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
|
||||
maxf = EXTRACT_FLOAT(v_res);
|
||||
mask = VMFGEVF_FLOAT(v_max, maxf, gvl);
|
||||
max_index = VMFIRSTM(mask,gvl);
|
||||
#ifdef DOUBLE
|
||||
max_index = *((BLASLONG *)&v_max_index+max_index);
|
||||
#else
|
||||
max_index = *((unsigned int *)&v_max_index+max_index);
|
||||
#endif
|
||||
UINT_V_T compressed;
|
||||
compressed = VCOMPRESS(v_max_index, mask, gvl);
|
||||
max_index = VMV_X(compressed);
|
||||
|
||||
if(j < n){
|
||||
gvl = VSETVL(n-j);
|
||||
vx = VLEV_FLOAT(&x[j], gvl);
|
||||
//fabs(vector)
|
||||
mask = VMFLTVF_FLOAT(vx, 0, gvl);
|
||||
v_max = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);
|
||||
v_max = VLEV_FLOAT(&x[j], gvl);
|
||||
v_max = VFABS_FLOAT(v_max, gvl);
|
||||
|
||||
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl);
|
||||
FLOAT cur_maxf = VFMVFS_FLOAT(v_res);
|
||||
v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
|
||||
FLOAT cur_maxf = EXTRACT_FLOAT(v_res);
|
||||
if(cur_maxf > maxf){
|
||||
//tail index
|
||||
v_max_index = VIDV_UINT(gvl);
|
||||
v_max_index = VADDVX_UINT(v_max_index, j, gvl);
|
||||
v_max_index = VADDVX_UINT(vid, j, gvl);
|
||||
|
||||
mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
|
||||
max_index = VMFIRSTM(mask,gvl);
|
||||
#ifdef DOUBLE
|
||||
max_index = *((BLASLONG*)&v_max_index+max_index);
|
||||
#else
|
||||
max_index = *((unsigned int*)&v_max_index+max_index);
|
||||
#endif
|
||||
UINT_V_T compressed;
|
||||
compressed = VCOMPRESS(v_max_index, mask, gvl);
|
||||
max_index = VMV_X(compressed);
|
||||
}
|
||||
}
|
||||
}else{
|
||||
@@ -155,56 +151,48 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
unsigned int stride_x = inc_x * sizeof(FLOAT);
|
||||
unsigned int idx = 0, inc_v = gvl * inc_x;
|
||||
|
||||
v_max = VFMVVF_FLOAT(-FLT_MAX, gvl);
|
||||
v_max_index = VMVVX_UINT(0, gvl);
|
||||
v_max = VFMVVF_FLOAT(-1, gvl);
|
||||
for(i=0,j=0; i < n/gvl; i++){
|
||||
vx = VLSEV_FLOAT(&x[idx], stride_x, gvl);
|
||||
//fabs(vector)
|
||||
mask = VMFLTVF_FLOAT(vx, 0, gvl);
|
||||
vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);
|
||||
vx = VFABS_FLOAT(vx, gvl);
|
||||
|
||||
//index where element greater than v_max
|
||||
mask = VMFLTVV_FLOAT(v_max, vx, gvl);
|
||||
v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl);
|
||||
v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl);
|
||||
v_max_index = VADDVX_MASK_UINT(mask, v_max_index, vid, j, gvl);
|
||||
|
||||
//update v_max and start_index j
|
||||
v_max = VFMAXVV_FLOAT(v_max, vx, gvl);
|
||||
j += gvl;
|
||||
idx += inc_v;
|
||||
}
|
||||
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl);
|
||||
maxf = VFMVFS_FLOAT(v_res);
|
||||
|
||||
v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
|
||||
maxf = EXTRACT_FLOAT(v_res);
|
||||
mask = VMFGEVF_FLOAT(v_max, maxf, gvl);
|
||||
max_index = VMFIRSTM(mask,gvl);
|
||||
#ifdef DOUBLE
|
||||
max_index = *((BLASLONG*)&v_max_index+max_index);
|
||||
#else
|
||||
max_index = *((unsigned int*)&v_max_index+max_index);
|
||||
#endif
|
||||
UINT_V_T compressed;
|
||||
compressed = VCOMPRESS(v_max_index, mask, gvl);
|
||||
max_index = VMV_X(compressed);
|
||||
|
||||
if(j < n){
|
||||
gvl = VSETVL(n-j);
|
||||
vx = VLSEV_FLOAT(&x[idx], stride_x, gvl);
|
||||
//fabs(vector)
|
||||
mask = VMFLTVF_FLOAT(vx, 0, gvl);
|
||||
v_max = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);
|
||||
v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl);
|
||||
v_max = VFABS_FLOAT(v_max, gvl);
|
||||
|
||||
v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
|
||||
FLOAT cur_maxf = EXTRACT_FLOAT(v_res);
|
||||
|
||||
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl);
|
||||
FLOAT cur_maxf = VFMVFS_FLOAT(v_res);
|
||||
if(cur_maxf > maxf){
|
||||
//tail index
|
||||
v_max_index = VIDV_UINT(gvl);
|
||||
v_max_index = VADDVX_UINT(v_max_index, j, gvl);
|
||||
v_max_index = VADDVX_UINT(vid, j, gvl);
|
||||
|
||||
mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
|
||||
max_index = VMFIRSTM(mask,gvl);
|
||||
#ifdef DOUBLE
|
||||
max_index = *((BLASLONG*)&v_max_index+max_index);
|
||||
#else
|
||||
max_index = *((unsigned int*)&v_max_index+max_index);
|
||||
#endif
|
||||
|
||||
UINT_V_T compressed;
|
||||
compressed = VCOMPRESS(v_max_index, mask, gvl);
|
||||
max_index = VMV_X(compressed);
|
||||
}
|
||||
}
|
||||
}
|
||||
return(max_index+1);
|
||||
return(max_index+1);
|
||||
}
|
||||
|
||||
150
kernel/riscv64/iamin_rvv.c
Normal file
150
kernel/riscv64/iamin_rvv.c
Normal file
@@ -0,0 +1,150 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <float.h>
|
||||
|
||||
// Precision-dependent aliases for the RVV intrinsics used by this kernel.
// Both variants work on LMUL=8 register groups; only the element width
// (and therefore the mask type) differs.  Macros whose name ends in _TU are
// bound to the *_tu / *_tumu intrinsic variants.
#if defined(DOUBLE)

// vector length control
#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
#define VSETVL_MAX              __riscv_vsetvlmax_e64m8()

// vector and mask types
#define FLOAT_V_T               vfloat64m8_t
#define FLOAT_V_T_M1            vfloat64m1_t
#define UINT_V_T                vuint64m8_t
#define MASK_T                  vbool8_t

// loads (unit stride / strided)
#define VLEV_FLOAT              __riscv_vle64_v_f64m8
#define VLSEV_FLOAT             __riscv_vlse64_v_f64m8

// floating-point arithmetic, splats and reduction
#define VFABSV_FLOAT            __riscv_vfabs_v_f64m8
#define VFMINVV_FLOAT_TU        __riscv_vfmin_vv_f64m8_tu
#define VFREDMINVS_FLOAT        __riscv_vfredmin_vs_f64m8_f64m1
#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64

// comparisons producing masks, and mask queries
#define VMFLTVF_FLOAT           __riscv_vmflt_vf_f64m8_b8
#define VMFLTVV_FLOAT           __riscv_vmflt_vv_f64m8_b8
#define VMFLEVF_FLOAT           __riscv_vmfle_vf_f64m8_b8
#define VFIRSTM                 __riscv_vfirst_m_b8

// index-vector bookkeeping
#define VIDV_UINT               __riscv_vid_v_u64m8
#define VIDV_MASK_UINT_TU       __riscv_vid_v_u64m8_tumu
#define VADDVX_UINT             __riscv_vadd_vx_u64m8
#define VADDVX_MASK_UINT_TU     __riscv_vadd_vx_u64m8_tumu
#define VMVVX_UINT              __riscv_vmv_v_x_u64m8
#define VSLIDEDOWN_UINT         __riscv_vslidedown_vx_u64m8
#define VMVVXS_UINT             __riscv_vmv_x_s_u64m8_u64

#else

// vector length control
#define VSETVL(n)               __riscv_vsetvl_e32m8(n)
#define VSETVL_MAX              __riscv_vsetvlmax_e32m8()

// vector and mask types
#define FLOAT_V_T               vfloat32m8_t
#define FLOAT_V_T_M1            vfloat32m1_t
#define UINT_V_T                vuint32m8_t
#define MASK_T                  vbool4_t

// loads (unit stride / strided)
#define VLEV_FLOAT              __riscv_vle32_v_f32m8
#define VLSEV_FLOAT             __riscv_vlse32_v_f32m8

// floating-point arithmetic, splats and reduction
#define VFABSV_FLOAT            __riscv_vfabs_v_f32m8
#define VFMINVV_FLOAT_TU        __riscv_vfmin_vv_f32m8_tu
#define VFREDMINVS_FLOAT        __riscv_vfredmin_vs_f32m8_f32m1
#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32

// comparisons producing masks, and mask queries
#define VMFLTVF_FLOAT           __riscv_vmflt_vf_f32m8_b4
#define VMFLTVV_FLOAT           __riscv_vmflt_vv_f32m8_b4
#define VMFLEVF_FLOAT           __riscv_vmfle_vf_f32m8_b4
#define VFIRSTM                 __riscv_vfirst_m_b4

// index-vector bookkeeping
#define VIDV_UINT               __riscv_vid_v_u32m8
#define VIDV_MASK_UINT_TU       __riscv_vid_v_u32m8_tumu
#define VADDVX_UINT             __riscv_vadd_vx_u32m8
#define VADDVX_MASK_UINT_TU     __riscv_vadd_vx_u32m8_tumu
#define VMVVX_UINT              __riscv_vmv_v_x_u32m8
#define VSLIDEDOWN_UINT         __riscv_vslidedown_vx_u32m8
#define VMVVXS_UINT             __riscv_vmv_x_s_u32m8_u32

#endif
|
||||
|
||||
/*
 * Extra per-precision helpers for the final index selection in CNAME below.
 *   FLOAT_SEED_MAX     largest finite value of FLOAT, used to seed the minima
 *   IDX_V_T_M1         LMUL=1 vector holding the reduced index
 *   IDX_ALL_ONES       identity element for the unsigned-min reduction
 */
#if defined(DOUBLE)
#define FLOAT_SEED_MAX          DBL_MAX
#define IDX_V_T_M1              vuint64m1_t
#define IDX_ALL_ONES            (~0ULL)
#define VMVVX_IDX_M1            __riscv_vmv_v_x_u64m1
#define VREDMINU_MASK_IDX       __riscv_vredminu_vs_u64m8_u64m1_m
#define VMVXS_IDX_M1            __riscv_vmv_x_s_u64m1_u64
#else
#define FLOAT_SEED_MAX          FLT_MAX
#define IDX_V_T_M1              vuint32m1_t
#define IDX_ALL_ONES            (~0U)
#define VMVVX_IDX_M1            __riscv_vmv_v_x_u32m1
#define VREDMINU_MASK_IDX       __riscv_vredminu_vs_u32m8_u32m1_m
#define VMVXS_IDX_M1            __riscv_vmv_x_s_u32m1_u32
#endif

/*
 * Returns the 1-based position of the first element of x with the smallest
 * absolute value, or 0 when n <= 0 or inc_x <= 0.
 *
 *   n      number of elements to scan
 *   x      input vector
 *   inc_x  distance, in elements, between consecutive entries of x
 *
 * Each vector lane tracks the smallest |x| it has seen together with the
 * position at which it first saw it.  After the scan the lanes are reduced
 * to the global minimum, and the smallest recorded position among the lanes
 * holding that minimum is returned.
 *
 * Changes relative to the previous version:
 *   - the minima are seeded with the largest finite value of FLOAT instead
 *     of FLT_MAX, so double-precision inputs whose magnitudes all exceed
 *     FLT_MAX are handled;
 *   - ties between lanes are resolved by the smallest recorded position
 *     rather than by the lowest lane number;
 *   - the result is no longer narrowed through an unsigned int.
 *
 * Positions are kept in lanes as wide as FLOAT (UINT_V_T), so in single
 * precision they are 32-bit values.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    if (n <= 0 || inc_x <= 0) return 0;

    FLOAT_V_T vx, v_min;
    UINT_V_T v_min_index;
    MASK_T mask;

    size_t vlmax = VSETVL_MAX;
    v_min_index = VMVVX_UINT(0, vlmax);
    v_min = VFMVVF_FLOAT(FLOAT_SEED_MAX, vlmax);
    BLASLONG j = 0;   /* position of lane 0 of the current chunk */

    if (inc_x == 1) {

        for (size_t vl; n > 0; n -= vl, x += vl, j += vl) {
            vl = VSETVL(n);

            vx = VLEV_FLOAT(x, vl);
            vx = VFABSV_FLOAT(vx, vl);

            /* lanes whose new element is strictly smaller than their minimum */
            mask = VMFLTVV_FLOAT(vx, v_min, vl);
            /* record lane id + j for those lanes only; others keep their value */
            v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl);
            v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl);

            /* update the per-lane minima; lanes beyond vl are left untouched */
            v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx, vl);
        }

    } else {

        BLASLONG stride_x = inc_x * sizeof(FLOAT);   /* stride in bytes */

        for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) {
            vl = VSETVL(n);

            vx = VLSEV_FLOAT(x, stride_x, vl);
            vx = VFABSV_FLOAT(vx, vl);

            /* lanes whose new element is strictly smaller than their minimum */
            mask = VMFLTVV_FLOAT(vx, v_min, vl);
            /* record lane id + j for those lanes only; others keep their value */
            v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl);
            v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl);

            /* update the per-lane minima; lanes beyond vl are left untouched */
            v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx, vl);
        }

    }

    /* global minimum over all lanes */
    FLOAT_V_T_M1 v_res = VFMVVF_FLOAT_M1(FLOAT_SEED_MAX, vlmax);
    v_res = VFREDMINVS_FLOAT(v_min, v_res, vlmax);
    FLOAT minf = VFMVFS_FLOAT_M1(v_res);

    /* lanes that hold the global minimum */
    mask = VMFLEVF_FLOAT(v_min, minf, vlmax);

    /* smallest recorded position among those lanes = first occurrence */
    IDX_V_T_M1 v_first = VMVVX_IDX_M1(IDX_ALL_ONES, vlmax);
    v_first = VREDMINU_MASK_IDX(mask, v_min_index, v_first, vlmax);

    return (BLASLONG)VMVXS_IDX_M1(v_first) + 1;
}
|
||||
@@ -31,85 +31,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#if defined(DOUBLE)
|
||||
|
||||
#define ABS fabs
|
||||
#define VSETVL(n) vsetvl_e64m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e64m1()
|
||||
#define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n)
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VLEV_FLOAT vle64_v_f64m8
|
||||
#define VLSEV_FLOAT vlse64_v_f64m8
|
||||
#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
|
||||
#define MASK_T vbool8_t
|
||||
#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
|
||||
#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f64m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
|
||||
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
|
||||
#define VFMINVV_FLOAT vfmin_vv_f64m8
|
||||
#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8
|
||||
#define VMFIRSTM vmfirst_m_b8
|
||||
#define UINT_V_T vuint64m8_t
|
||||
#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8)
|
||||
#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8)
|
||||
#ifdef RISCV_0p10_INTRINSICS
|
||||
#define VFREDMINVS_FLOAT(va, vb, gvl) vfredmin_vs_f64m8_f64m1(v_res, va, vb, gvl)
|
||||
#define VIDV_MASK_UINT vid_v_u64m8_m
|
||||
#define VIDV_UINT vid_v_u64m8
|
||||
#define VADDVX_MASK_UINT vadd_vx_u64m8_m
|
||||
#define VADDVX_UINT vadd_vx_u64m8
|
||||
#define VMVVX_UINT vmv_v_x_u64m8
|
||||
#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u64m8)(vm, compressed, va, gvl)
|
||||
#else
|
||||
#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1
|
||||
#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu
|
||||
#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu
|
||||
#define VCOMPRESS RISCV_RVV(vcompress_vm_u64m8)
|
||||
#endif
|
||||
#define MASK_T vbool8_t
|
||||
#define VMFGTVV_FLOAT RISCV_RVV(vmfgt_vv_f64m8_b8)
|
||||
#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8)
|
||||
#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1)
|
||||
#define VFMINVV_FLOAT RISCV_RVV(vfmin_vv_f64m8)
|
||||
#define VMFLEVF_FLOAT RISCV_RVV(vmfle_vf_f64m8_b8)
|
||||
#define VMFIRSTM RISCV_RVV(vfirst_m_b8)
|
||||
#define UINT_V_T vuint64m8_t
|
||||
#define VIDV_UINT RISCV_RVV(vid_v_u64m8)
|
||||
#define VADDVX_UINT RISCV_RVV(vadd_vx_u64m8)
|
||||
#define VMVVX_UINT RISCV_RVV(vmv_v_x_u64m8)
|
||||
#define VFABS_FLOAT RISCV_RVV(vfabs_v_f64m8)
|
||||
#define VMV_X RISCV_RVV(vmv_x_s_u64m8_u64)
|
||||
#else
|
||||
|
||||
#define ABS fabsf
|
||||
#define VSETVL(n) vsetvl_e32m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m1()
|
||||
#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n)
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VLEV_FLOAT vle32_v_f32m8
|
||||
#define VLSEV_FLOAT vlse32_v_f32m8
|
||||
#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
|
||||
#define MASK_T vbool4_t
|
||||
#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
|
||||
#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f32m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
|
||||
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
|
||||
#define VFMINVV_FLOAT vfmin_vv_f32m8
|
||||
#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4
|
||||
#define VMFIRSTM vmfirst_m_b4
|
||||
#define UINT_V_T vuint32m8_t
|
||||
#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8)
|
||||
#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8)
|
||||
#ifdef RISCV_0p10_INTRINSICS
|
||||
#define VFREDMINVS_FLOAT(va, vb, gvl) vfredmin_vs_f32m8_f32m1(v_res, va, vb, gvl)
|
||||
#define VIDV_MASK_UINT vid_v_u32m8_m
|
||||
#define VIDV_UINT vid_v_u32m8
|
||||
#define VADDVX_MASK_UINT vadd_vx_u32m8_m
|
||||
#define VADDVX_UINT vadd_vx_u32m8
|
||||
#define VMVVX_UINT vmv_v_x_u32m8
|
||||
#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u32m8)(vm, compressed, va, gvl)
|
||||
#else
|
||||
#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1
|
||||
#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu
|
||||
#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu
|
||||
#define VCOMPRESS RISCV_RVV(vcompress_vm_u32m8)
|
||||
#endif
|
||||
#define MASK_T vbool4_t
|
||||
#define VMFGTVV_FLOAT RISCV_RVV(vmfgt_vv_f32m8_b4)
|
||||
#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8)
|
||||
#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1)
|
||||
#define VFMINVV_FLOAT RISCV_RVV(vfmin_vv_f32m8)
|
||||
#define VMFLEVF_FLOAT RISCV_RVV(vmfle_vf_f32m8_b4)
|
||||
#define VMFIRSTM RISCV_RVV(vfirst_m_b4)
|
||||
#define UINT_V_T vuint32m8_t
|
||||
#define VIDV_UINT RISCV_RVV(vid_v_u32m8)
|
||||
#define VADDVX_UINT RISCV_RVV(vadd_vx_u32m8)
|
||||
#define VMVVX_UINT RISCV_RVV(vmv_v_x_u32m8)
|
||||
#define VFABS_FLOAT RISCV_RVV(vfabs_v_f32m8)
|
||||
#define VMV_X RISCV_RVV(vmv_x_s_u32m8_u32)
|
||||
#endif
|
||||
|
||||
|
||||
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
BLASLONG i=0, j=0;
|
||||
FLOAT minf=FLT_MAX;
|
||||
BLASLONG i=0, j=0;
|
||||
unsigned int min_index = 0;
|
||||
if (n <= 0 || inc_x <= 0) return(min_index);
|
||||
if (n <= 0 || inc_x <= 0) return(min_index);
|
||||
FLOAT minf=FLT_MAX;
|
||||
|
||||
FLOAT_V_T vx, v_min;
|
||||
UINT_V_T v_min_index;
|
||||
MASK_T mask;
|
||||
unsigned int gvl = 0;
|
||||
FLOAT_V_T_M1 v_res, v_max;
|
||||
gvl = VSETVL_MAX;
|
||||
v_res = VFMVVF_FLOAT_M1(0, gvl);
|
||||
v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl);
|
||||
FLOAT_V_T_M1 v_res;
|
||||
v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1);
|
||||
|
||||
if(inc_x == 1){
|
||||
gvl = VSETVL(n);
|
||||
v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
|
||||
v_min_index = VMVVX_UINT(0, gvl);
|
||||
v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
|
||||
for(i=0,j=0; i < n/gvl; i++){
|
||||
vx = VLEV_FLOAT(&x[j], gvl);
|
||||
//fabs(vector)
|
||||
mask = VMFLTVF_FLOAT(vx, 0, gvl);
|
||||
vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);
|
||||
vx = VFABS_FLOAT(vx, gvl);
|
||||
|
||||
//index where element less than v_min
|
||||
mask = VMFLTVV_FLOAT(vx, v_min, gvl);
|
||||
//index where element greater than v_min
|
||||
mask = VMFGTVV_FLOAT(v_min, vx, gvl);
|
||||
v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl);
|
||||
v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl);
|
||||
|
||||
@@ -117,29 +125,29 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
v_min = VFMINVV_FLOAT(v_min, vx, gvl);
|
||||
j += gvl;
|
||||
}
|
||||
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
|
||||
minf = *((FLOAT*)&v_res);
|
||||
v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
|
||||
minf = EXTRACT_FLOAT(v_res);
|
||||
mask = VMFLEVF_FLOAT(v_min, minf, gvl);
|
||||
min_index = VMFIRSTM(mask,gvl);
|
||||
min_index = *((unsigned int*)&v_min_index+min_index);
|
||||
UINT_V_T compressed;
|
||||
compressed = VCOMPRESS(v_min_index, mask, gvl);
|
||||
min_index = VMV_X(compressed);
|
||||
|
||||
if(j < n){
|
||||
gvl = VSETVL(n-j);
|
||||
vx = VLEV_FLOAT(&x[j], gvl);
|
||||
//fabs(vector)
|
||||
mask = VMFLTVF_FLOAT(vx, 0, gvl);
|
||||
v_min = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);
|
||||
v_min = VLEV_FLOAT(&x[j], gvl);
|
||||
v_min = VFABS_FLOAT(v_min, gvl);
|
||||
|
||||
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
|
||||
FLOAT cur_minf = *((FLOAT*)&v_res);
|
||||
v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
|
||||
FLOAT cur_minf = EXTRACT_FLOAT(v_res);
|
||||
if(cur_minf < minf){
|
||||
//tail index
|
||||
v_min_index = VIDV_UINT(gvl);
|
||||
v_min_index = VADDVX_UINT(v_min_index, j, gvl);
|
||||
|
||||
mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl);
|
||||
min_index = VMFIRSTM(mask,gvl);
|
||||
min_index = *((unsigned int*)&v_min_index+min_index);
|
||||
UINT_V_T compressed;
|
||||
compressed = VCOMPRESS(v_min_index, mask, gvl);
|
||||
min_index = VMV_X(compressed);
|
||||
}
|
||||
}
|
||||
}else{
|
||||
@@ -151,12 +159,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
v_min_index = VMVVX_UINT(0, gvl);
|
||||
for(i=0,j=0; i < n/gvl; i++){
|
||||
vx = VLSEV_FLOAT(&x[idx], stride_x, gvl);
|
||||
//fabs(vector)
|
||||
mask = VMFLTVF_FLOAT(vx, 0, gvl);
|
||||
vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);
|
||||
vx = VFABS_FLOAT(vx, gvl);
|
||||
|
||||
//index where element less than v_min
|
||||
mask = VMFLTVV_FLOAT(vx, v_min, gvl);
|
||||
//index where element greater than v_min
|
||||
mask = VMFGTVV_FLOAT(v_min, vx, gvl);
|
||||
v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl);
|
||||
v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl);
|
||||
|
||||
@@ -165,33 +171,31 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
j += gvl;
|
||||
idx += inc_v;
|
||||
}
|
||||
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
|
||||
minf = *((FLOAT*)&v_res);
|
||||
v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
|
||||
minf = EXTRACT_FLOAT(v_res);
|
||||
mask = VMFLEVF_FLOAT(v_min, minf, gvl);
|
||||
min_index = VMFIRSTM(mask,gvl);
|
||||
min_index = *((unsigned int*)&v_min_index+min_index);
|
||||
UINT_V_T compressed;
|
||||
compressed = VCOMPRESS(v_min_index, mask, gvl);
|
||||
min_index = VMV_X(compressed);
|
||||
|
||||
if(j < n){
|
||||
gvl = VSETVL(n-j);
|
||||
vx = VLSEV_FLOAT(&x[idx], stride_x, gvl);
|
||||
//fabs(vector)
|
||||
mask = VMFLTVF_FLOAT(vx, 0, gvl);
|
||||
v_min = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);
|
||||
v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl);
|
||||
v_min = VFABS_FLOAT(v_min, gvl);
|
||||
|
||||
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
|
||||
FLOAT cur_minf = *((FLOAT*)&v_res);
|
||||
v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
|
||||
FLOAT cur_minf = EXTRACT_FLOAT(v_res);
|
||||
if(cur_minf < minf){
|
||||
//tail index
|
||||
v_min_index = VIDV_UINT(gvl);
|
||||
v_min_index = VADDVX_UINT(v_min_index, j, gvl);
|
||||
|
||||
mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl);
|
||||
min_index = VMFIRSTM(mask,gvl);
|
||||
min_index = *((unsigned int*)&v_min_index+min_index);
|
||||
UINT_V_T compressed;
|
||||
compressed = VCOMPRESS(v_min_index, mask, gvl);
|
||||
min_index = VMV_X(compressed);
|
||||
}
|
||||
}
|
||||
}
|
||||
return(min_index+1);
|
||||
return(min_index+1);
|
||||
}
|
||||
|
||||
|
||||
|
||||
146
kernel/riscv64/imax_rvv.c
Normal file
146
kernel/riscv64/imax_rvv.c
Normal file
@@ -0,0 +1,146 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <float.h>
|
||||
|
||||
#if defined(DOUBLE)
|
||||
#define VSETVL(n) __riscv_vsetvl_e64m8(n)
|
||||
#define VSETVL_MAX __riscv_vsetvlmax_e64m8()
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VLEV_FLOAT __riscv_vle64_v_f64m8
|
||||
#define VLSEV_FLOAT __riscv_vlse64_v_f64m8
|
||||
#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1
|
||||
#define MASK_T vbool8_t
|
||||
#define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m8_b8
|
||||
#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m8_b8
|
||||
#define VMFGEVF_FLOAT __riscv_vmfge_vf_f64m8_b8
|
||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8
|
||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
|
||||
#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m8_tu
|
||||
#define VFIRSTM __riscv_vfirst_m_b8
|
||||
#define UINT_V_T vuint64m8_t
|
||||
#define VIDV_MASK_UINT_TU __riscv_vid_v_u64m8_tumu
|
||||
#define VIDV_UINT __riscv_vid_v_u64m8
|
||||
#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u64m8_tumu
|
||||
#define VADDVX_UINT __riscv_vadd_vx_u64m8
|
||||
#define VMVVX_UINT __riscv_vmv_v_x_u64m8
|
||||
#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64
|
||||
#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u64m8
|
||||
#define VMVVXS_UINT __riscv_vmv_x_s_u64m8_u64
|
||||
#else
|
||||
#define VSETVL(n) __riscv_vsetvl_e32m8(n)
|
||||
#define VSETVL_MAX __riscv_vsetvlmax_e32m8()
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VLEV_FLOAT __riscv_vle32_v_f32m8
|
||||
#define VLSEV_FLOAT __riscv_vlse32_v_f32m8
|
||||
#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1
|
||||
#define MASK_T vbool4_t
|
||||
#define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m8_b4
|
||||
#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m8_b4
|
||||
#define VMFGEVF_FLOAT __riscv_vmfge_vf_f32m8_b4
|
||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8
|
||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
|
||||
#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m8_tu
|
||||
#define VFIRSTM __riscv_vfirst_m_b4
|
||||
#define UINT_V_T vuint32m8_t
|
||||
#define VIDV_MASK_UINT_TU __riscv_vid_v_u32m8_tumu
|
||||
#define VIDV_UINT __riscv_vid_v_u32m8
|
||||
#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u32m8_tumu
|
||||
#define VADDVX_UINT __riscv_vadd_vx_u32m8
|
||||
#define VMVVX_UINT __riscv_vmv_v_x_u32m8
|
||||
#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32
|
||||
#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u32m8
|
||||
#define VMVVXS_UINT __riscv_vmv_x_s_u32m8_u32
|
||||
#endif
|
||||
|
||||
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
unsigned int max_index = 0;
|
||||
if (n <= 0 || inc_x <= 0) return(max_index);
|
||||
|
||||
FLOAT_V_T vx, v_max;
|
||||
UINT_V_T v_max_index;
|
||||
MASK_T mask;
|
||||
|
||||
size_t vlmax = VSETVL_MAX;
|
||||
v_max_index = VMVVX_UINT(0, vlmax);
|
||||
v_max = VFMVVF_FLOAT(-FLT_MAX, vlmax);
|
||||
BLASLONG j=0;
|
||||
FLOAT maxf=0.0;
|
||||
|
||||
if(inc_x == 1) {
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl, j += vl) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
vx = VLEV_FLOAT(x, vl);
|
||||
|
||||
//index where element greater than v_max
|
||||
mask = VMFLTVV_FLOAT(v_max, vx, vl);
|
||||
v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl);
|
||||
v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl);
|
||||
|
||||
//update v_max and start_index j
|
||||
v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx, vl);
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
BLASLONG stride_x = inc_x * sizeof(FLOAT);
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
vx = VLSEV_FLOAT(x, stride_x, vl);
|
||||
|
||||
//index where element greater than v_max
|
||||
mask = VMFLTVV_FLOAT(v_max, vx, vl);
|
||||
v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl);
|
||||
v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl);
|
||||
|
||||
//update v_max and start_index j
|
||||
v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx, vl);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
FLOAT_V_T_M1 v_res;
|
||||
v_res = VFMVVF_FLOAT_M1(-FLT_MAX, vlmax);
|
||||
|
||||
v_res = VFREDMAXVS_FLOAT(v_max, v_res, vlmax);
|
||||
maxf = VFMVFS_FLOAT_M1(v_res);
|
||||
mask = VMFGEVF_FLOAT(v_max, maxf, vlmax);
|
||||
max_index = VFIRSTM(mask, vlmax);
|
||||
|
||||
v_max_index = VSLIDEDOWN_UINT(v_max_index, max_index, vlmax);
|
||||
max_index = VMVVXS_UINT(v_max_index);
|
||||
|
||||
return(max_index+1);
|
||||
}
|
||||
@@ -31,68 +31,80 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#if defined(DOUBLE)
|
||||
|
||||
#define ABS fabs
|
||||
#define VSETVL(n) vsetvl_e64m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e64m1()
|
||||
#define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n)
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VLEV_FLOAT vle64_v_f64m8
|
||||
#define VLSEV_FLOAT vlse64_v_f64m8
|
||||
#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
|
||||
#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8)
|
||||
#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8)
|
||||
#ifdef RISCV_0p10_INTRINSICS
|
||||
#define VFREDMAXVS_FLOAT(va, vb, gvl) vfredmax_vs_f64m8_f64m1(v_res, va, vb, gvl)
|
||||
#define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_m)
|
||||
#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_m)
|
||||
#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u64m8)(vm, compressed, va, gvl)
|
||||
#else
|
||||
#define VFREDMAXVS_FLOAT RISCV_RVV(vfredmax_vs_f64m8_f64m1)
|
||||
#define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_mu)
|
||||
#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_mu)
|
||||
#define VCOMPRESS RISCV_RVV(vcompress_vm_u64m8)
|
||||
#endif
|
||||
#define MASK_T vbool8_t
|
||||
#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f64m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
|
||||
#define VFMAXVV_FLOAT vfmax_vv_f64m8
|
||||
#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8
|
||||
#define VMFIRSTM vmfirst_m_b8
|
||||
#define VMFLTVV_FLOAT RISCV_RVV(vmflt_vv_f64m8_b8)
|
||||
#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8)
|
||||
#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1)
|
||||
#define VFMAXVV_FLOAT RISCV_RVV(vfmax_vv_f64m8)
|
||||
#define VMFGEVF_FLOAT RISCV_RVV(vmfge_vf_f64m8_b8)
|
||||
#define VMFIRSTM RISCV_RVV(vfirst_m_b8)
|
||||
#define UINT_V_T vuint64m8_t
|
||||
#define VIDV_MASK_UINT vid_v_u64m8_m
|
||||
#define VIDV_UINT vid_v_u64m8
|
||||
#define VADDVX_MASK_UINT vadd_vx_u64m8_m
|
||||
#define VADDVX_UINT vadd_vx_u64m8
|
||||
#define VMVVX_UINT vmv_v_x_u64m8
|
||||
#define VIDV_UINT RISCV_RVV(vid_v_u64m8)
|
||||
#define VADDVX_UINT RISCV_RVV(vadd_vx_u64m8)
|
||||
#define VMVVX_UINT RISCV_RVV(vmv_v_x_u64m8)
|
||||
#define VMV_X RISCV_RVV(vmv_x_s_u64m8_u64)
|
||||
#else
|
||||
|
||||
#define ABS fabsf
|
||||
#define VSETVL(n) vsetvl_e32m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m1()
|
||||
#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n)
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VLEV_FLOAT vle32_v_f32m8
|
||||
#define VLSEV_FLOAT vlse32_v_f32m8
|
||||
#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
|
||||
#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8)
|
||||
#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8)
|
||||
#ifdef RISCV_0p10_INTRINSICS
|
||||
#define VFREDMAXVS_FLOAT(va, vb, gvl) vfredmax_vs_f32m8_f32m1(v_res, va, vb, gvl)
|
||||
#define VIDV_MASK_UINT RISCV_RVV(vid_v_u32m8_m)
|
||||
#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m8_m)
|
||||
#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u32m8)(vm, compressed, va, gvl)
|
||||
#else
|
||||
#define VFREDMAXVS_FLOAT RISCV_RVV(vfredmax_vs_f32m8_f32m1)
|
||||
#define VIDV_MASK_UINT RISCV_RVV(vid_v_u32m8_mu)
|
||||
#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m8_mu)
|
||||
#define VCOMPRESS RISCV_RVV(vcompress_vm_u32m8)
|
||||
#endif
|
||||
#define MASK_T vbool4_t
|
||||
#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f32m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
|
||||
#define VFMAXVV_FLOAT vfmax_vv_f32m8
|
||||
#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4
|
||||
#define VMFIRSTM vmfirst_m_b4
|
||||
#define VMFLTVV_FLOAT RISCV_RVV(vmflt_vv_f32m8_b4)
|
||||
#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8)
|
||||
#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1)
|
||||
#define VFMAXVV_FLOAT RISCV_RVV(vfmax_vv_f32m8)
|
||||
#define VMFGEVF_FLOAT RISCV_RVV(vmfge_vf_f32m8_b4)
|
||||
#define VMFIRSTM RISCV_RVV(vfirst_m_b4)
|
||||
#define UINT_V_T vuint32m8_t
|
||||
#define VIDV_MASK_UINT vid_v_u32m8_m
|
||||
#define VIDV_UINT vid_v_u32m8
|
||||
#define VADDVX_MASK_UINT vadd_vx_u32m8_m
|
||||
#define VADDVX_UINT vadd_vx_u32m8
|
||||
#define VMVVX_UINT vmv_v_x_u32m8
|
||||
#define VIDV_UINT RISCV_RVV(vid_v_u32m8)
|
||||
#define VADDVX_UINT RISCV_RVV(vadd_vx_u32m8)
|
||||
#define VMVVX_UINT RISCV_RVV(vmv_v_x_u32m8)
|
||||
#define VMV_X RISCV_RVV(vmv_x_s_u32m8_u32)
|
||||
#endif
|
||||
|
||||
|
||||
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
BLASLONG i=0, j=0;
|
||||
BLASLONG i=0, j=0;
|
||||
unsigned int max_index = 0;
|
||||
if (n <= 0 || inc_x <= 0) return(max_index);
|
||||
FLOAT maxf=-FLT_MAX;
|
||||
if (n <= 0 || inc_x <= 0) return(max_index);
|
||||
FLOAT maxf=-FLT_MAX;
|
||||
|
||||
FLOAT_V_T vx, v_max;
|
||||
UINT_V_T v_max_index;
|
||||
MASK_T mask;
|
||||
unsigned int gvl = 0;
|
||||
FLOAT_V_T_M1 v_res, v_min;
|
||||
gvl = VSETVL_MAX;
|
||||
v_res = VFMVVF_FLOAT_M1(0, gvl);
|
||||
v_min = VFMVVF_FLOAT_M1(-FLT_MAX, gvl);
|
||||
FLOAT_V_T_M1 v_res;
|
||||
v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1);
|
||||
|
||||
if(inc_x == 1){
|
||||
gvl = VSETVL(n);
|
||||
@@ -104,32 +116,34 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
//index where element greater than v_max
|
||||
mask = VMFLTVV_FLOAT(v_max, vx, gvl);
|
||||
v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl);
|
||||
v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl);
|
||||
v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl);
|
||||
|
||||
//update v_max and start_index j
|
||||
v_max = VFMAXVV_FLOAT(v_max, vx, gvl);
|
||||
j += gvl;
|
||||
}
|
||||
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl);
|
||||
maxf = *((FLOAT*)&v_res);
|
||||
v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
|
||||
maxf = EXTRACT_FLOAT(v_res);
|
||||
mask = VMFGEVF_FLOAT(v_max, maxf, gvl);
|
||||
max_index = VMFIRSTM(mask,gvl);
|
||||
max_index = *((unsigned int*)&v_max_index+max_index);
|
||||
UINT_V_T compressed;
|
||||
compressed = VCOMPRESS(v_max_index, mask, gvl);
|
||||
max_index = VMV_X(compressed);
|
||||
|
||||
if(j < n){
|
||||
gvl = VSETVL(n-j);
|
||||
v_max = VLEV_FLOAT(&x[j], gvl);
|
||||
|
||||
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl);
|
||||
FLOAT cur_maxf = *((FLOAT*)&v_res);
|
||||
v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
|
||||
FLOAT cur_maxf = EXTRACT_FLOAT(v_res);
|
||||
if(cur_maxf > maxf){
|
||||
//tail index
|
||||
v_max_index = VIDV_UINT(gvl);
|
||||
v_max_index = VADDVX_UINT(v_max_index, j, gvl);
|
||||
|
||||
mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
|
||||
max_index = VMFIRSTM(mask,gvl);
|
||||
max_index = *((unsigned int*)&v_max_index+max_index);
|
||||
UINT_V_T compressed;
|
||||
compressed = VCOMPRESS(v_max_index, mask, gvl);
|
||||
max_index = VMV_X(compressed);
|
||||
}
|
||||
}
|
||||
}else{
|
||||
@@ -145,37 +159,37 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
//index where element greater than v_max
|
||||
mask = VMFLTVV_FLOAT(v_max, vx, gvl);
|
||||
v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl);
|
||||
v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl);
|
||||
v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl);
|
||||
|
||||
//update v_max and start_index j
|
||||
v_max = VFMAXVV_FLOAT(v_max, vx, gvl);
|
||||
j += gvl;
|
||||
idx += inc_v;
|
||||
}
|
||||
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl);
|
||||
maxf = *((FLOAT*)&v_res);
|
||||
v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
|
||||
maxf = EXTRACT_FLOAT(v_res);
|
||||
mask = VMFGEVF_FLOAT(v_max, maxf, gvl);
|
||||
max_index = VMFIRSTM(mask,gvl);
|
||||
max_index = *((unsigned int*)&v_max_index+max_index);
|
||||
UINT_V_T compressed;
|
||||
compressed = VCOMPRESS(v_max_index, mask, gvl);
|
||||
max_index = VMV_X(compressed);
|
||||
|
||||
if(j < n){
|
||||
gvl = VSETVL(n-j);
|
||||
v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl);
|
||||
|
||||
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl);
|
||||
FLOAT cur_maxf = *((FLOAT*)&v_res);
|
||||
v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
|
||||
FLOAT cur_maxf = EXTRACT_FLOAT(v_res);
|
||||
if(cur_maxf > maxf){
|
||||
//tail index
|
||||
v_max_index = VIDV_UINT(gvl);
|
||||
v_max_index = VADDVX_UINT(v_max_index, j, gvl);
|
||||
|
||||
mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
|
||||
max_index = VMFIRSTM(mask,gvl);
|
||||
max_index = *((unsigned int*)&v_max_index+max_index);
|
||||
UINT_V_T compressed;
|
||||
compressed = VCOMPRESS(v_max_index, mask, gvl);
|
||||
max_index = VMV_X(compressed);
|
||||
}
|
||||
}
|
||||
}
|
||||
return(max_index+1);
|
||||
return(max_index+1);
|
||||
}
|
||||
|
||||
|
||||
|
||||
146
kernel/riscv64/imin_rvv.c
Normal file
146
kernel/riscv64/imin_rvv.c
Normal file
@@ -0,0 +1,146 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <float.h>
|
||||
|
||||
#if defined(DOUBLE)
|
||||
#define VSETVL(n) __riscv_vsetvl_e64m8(n)
|
||||
#define VSETVL_MAX __riscv_vsetvlmax_e64m8()
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VLEV_FLOAT __riscv_vle64_v_f64m8
|
||||
#define VLSEV_FLOAT __riscv_vlse64_v_f64m8
|
||||
#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1
|
||||
#define MASK_T vbool8_t
|
||||
#define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m8_b8
|
||||
#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m8_b8
|
||||
#define VMFLEVF_FLOAT __riscv_vmfle_vf_f64m8_b8
|
||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8
|
||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
|
||||
#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m8_tu
|
||||
#define VFIRSTM __riscv_vfirst_m_b8
|
||||
#define UINT_V_T vuint64m8_t
|
||||
#define VIDV_MASK_UINT_TU __riscv_vid_v_u64m8_tumu
|
||||
#define VIDV_UINT __riscv_vid_v_u64m8
|
||||
#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u64m8_tumu
|
||||
#define VADDVX_UINT __riscv_vadd_vx_u64m8
|
||||
#define VMVVX_UINT __riscv_vmv_v_x_u64m8
|
||||
#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64
|
||||
#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u64m8
|
||||
#define VMVVXS_UINT __riscv_vmv_x_s_u64m8_u64
|
||||
#else
|
||||
#define VSETVL(n) __riscv_vsetvl_e32m8(n)
|
||||
#define VSETVL_MAX __riscv_vsetvlmax_e32m8()
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VLEV_FLOAT __riscv_vle32_v_f32m8
|
||||
#define VLSEV_FLOAT __riscv_vlse32_v_f32m8
|
||||
#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1
|
||||
#define MASK_T vbool4_t
|
||||
#define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m8_b4
|
||||
#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m8_b4
|
||||
#define VMFLEVF_FLOAT __riscv_vmfle_vf_f32m8_b4
|
||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8
|
||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
|
||||
#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m8_tu
|
||||
#define VFIRSTM __riscv_vfirst_m_b4
|
||||
#define UINT_V_T vuint32m8_t
|
||||
#define VIDV_MASK_UINT_TU __riscv_vid_v_u32m8_tumu
|
||||
#define VIDV_UINT __riscv_vid_v_u32m8
|
||||
#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u32m8_tumu
|
||||
#define VADDVX_UINT __riscv_vadd_vx_u32m8
|
||||
#define VMVVX_UINT __riscv_vmv_v_x_u32m8
|
||||
#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32
|
||||
#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u32m8
|
||||
#define VMVVXS_UINT __riscv_vmv_x_s_u32m8_u32
|
||||
#endif
|
||||
|
||||
/*
 * Returns the 1-based position of the first smallest element of x, or 0 when
 * n <= 0 or inc_x <= 0.
 *
 *   n      number of elements to scan
 *   x      input vector
 *   inc_x  distance, in elements, between consecutive entries of x
 *
 * Every vector lane keeps a running minimum (v_min) together with the
 * position at which that minimum was first seen in that lane (v_min_index).
 * After the scan the lane minima are reduced to one scalar, and the smallest
 * position among the lanes holding that scalar is returned.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    if (n <= 0 || inc_x <= 0) return 0;

    /* Seed for the running minimum: the largest finite value of FLOAT.
       FLT_MAX is too small a seed in a DOUBLE build, because finite doubles
       above it never compare below the running minimum and are ignored. */
#if defined(DOUBLE)
    const FLOAT float_max = DBL_MAX;
#else
    const FLOAT float_max = FLT_MAX;
#endif

    const size_t vlmax = VSETVL_MAX;
    FLOAT_V_T v_min = VFMVVF_FLOAT(float_max, vlmax);
    UINT_V_T v_min_index = VMVVX_UINT(0, vlmax);
    FLOAT_V_T vx;
    MASK_T mask;
    BLASLONG j = 0;     /* position of lane 0 within x for the current chunk */

    if (inc_x == 1) {

        for (size_t vl; n > 0; n -= vl, x += vl, j += vl) {
            vl = VSETVL(n);

            vx = VLEV_FLOAT(x, vl);

            /* lanes whose new element is strictly below their running
               minimum take the position j + lane id; the rest keep theirs */
            mask = VMFLTVV_FLOAT(vx, v_min, vl);
            v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl);
            v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl);

            /* _tu form: lanes past vl keep the minimum from earlier chunks */
            v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx, vl);
        }

    } else {

        /* strided loads take the stride in bytes */
        BLASLONG stride_x = inc_x * sizeof(FLOAT);

        for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) {
            vl = VSETVL(n);

            vx = VLSEV_FLOAT(x, stride_x, vl);

            /* same per-lane update as the unit-stride loop */
            mask = VMFLTVV_FLOAT(vx, v_min, vl);
            v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl);
            v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl);

            v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx, vl);
        }

    }

    /* Reduce the lane minima to a scalar.  The reduction is seeded with the
       same value as the lanes, so at least one lane always equals minf. */
    FLOAT_V_T_M1 v_res = VFMVVF_FLOAT_M1(float_max, vlmax);
    v_res = VFREDMINVS_FLOAT(v_min, v_res, vlmax);
    const FLOAT minf = VFMVFS_FLOAT_M1(v_res);

    /* lanes holding the global minimum */
    mask = VMFLEVF_FLOAT(v_min, minf, vlmax);

    /* Several lanes can hold the minimum, and the lowest-numbered lane does
       not necessarily hold the lowest position (lane 0 may have found it in
       a later chunk than lane 1).  Take the unsigned minimum of the recorded
       positions over the matching lanes so the first occurrence is returned.
       The result is kept at full index width; an unsigned int would drop the
       upper half of a 64-bit position. */
#if defined(DOUBLE)
    vuint64m1_t v_first = __riscv_vmv_v_x_u64m1(~(uint64_t)0, 1);
    v_first = __riscv_vredminu_vs_u64m8_u64m1_m(mask, v_min_index, v_first, vlmax);
    const BLASLONG min_index = (BLASLONG)__riscv_vmv_x_s_u64m1_u64(v_first);
#else
    vuint32m1_t v_first = __riscv_vmv_v_x_u32m1(~(uint32_t)0, 1);
    v_first = __riscv_vredminu_vs_u32m8_u32m1_m(mask, v_min_index, v_first, vlmax);
    const BLASLONG min_index = (BLASLONG)__riscv_vmv_x_s_u32m1_u32(v_first);
#endif

    return min_index + 1;
}
|
||||
@@ -31,122 +31,119 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#if defined(DOUBLE)
|
||||
|
||||
#define ABS fabs
|
||||
#define VSETVL(n) vsetvl_e64m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e64m1()
|
||||
#define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n)
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VLEV_FLOAT vle64_v_f64m8
|
||||
#define VLSEV_FLOAT vlse64_v_f64m8
|
||||
#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
|
||||
#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8)
|
||||
#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8)
|
||||
#ifdef RISCV_0p10_INTRINSICS
|
||||
#define VFREDMINVS_FLOAT(va, vb, gvl) vfredmin_vs_f64m8_f64m1(v_res, va, vb, gvl)
|
||||
#define VIDV_MASK_UINT(mask, gvl) RISCV_RVV(vid_v_u64m8_m)(mask, v_min_index, gvl)
|
||||
#define VADDVX_MASK_UINT(mask, a, b, gvl) RISCV_RVV(vadd_vx_u64m8_m)(mask, a, a, b, gvl)
|
||||
#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u64m8)(vm, compressed, va, gvl)
|
||||
#else
|
||||
#define VFREDMINVS_FLOAT RISCV_RVV(vfredmin_vs_f64m8_f64m1)
|
||||
#define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_m)
|
||||
#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_m)
|
||||
#define VCOMPRESS RISCV_RVV(vcompress_vm_u64m8)
|
||||
#endif
|
||||
#define MASK_T vbool8_t
|
||||
#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f64m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
|
||||
#define VFMINVV_FLOAT vfmin_vv_f64m8
|
||||
#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8
|
||||
#define VMFIRSTM vmfirst_m_b8
|
||||
#define VMFGTVV_FLOAT RISCV_RVV(vmfgt_vv_f64m8_b8)
|
||||
#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8)
|
||||
#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1)
|
||||
#define VFMINVV_FLOAT RISCV_RVV(vfmin_vv_f64m8)
|
||||
#define VMFLEVF_FLOAT RISCV_RVV(vmfle_vf_f64m8_b8)
|
||||
#define VMFIRSTM RISCV_RVV(vfirst_m_b8)
|
||||
#define UINT_V_T vuint64m8_t
|
||||
#define VIDV_MASK_UINT vid_v_u64m8_m
|
||||
#define VIDV_UINT vid_v_u64m8
|
||||
#define VADDVX_MASK_UINT vadd_vx_u64m8_m
|
||||
#define VADDVX_UINT vadd_vx_u64m8
|
||||
#define VMVVX_UINT vmv_v_x_u64m8
|
||||
#define VIDV_UINT RISCV_RVV(vid_v_u64m8)
|
||||
#define VADDVX_UINT RISCV_RVV(vadd_vx_u64m8)
|
||||
#define VMVVX_UINT RISCV_RVV(vmv_v_x_u64m8)
|
||||
#define VMV_X RISCV_RVV(vmv_x_s_u64m8_u64)
|
||||
#else
|
||||
|
||||
#define ABS fabsf
|
||||
#define VSETVL(n) vsetvl_e32m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m1()
|
||||
#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n)
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VLEV_FLOAT vle32_v_f32m8
|
||||
#define VLSEV_FLOAT vlse32_v_f32m8
|
||||
#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
|
||||
#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8)
|
||||
#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8)
|
||||
#ifdef RISCV_0p10_INTRINSICS
|
||||
#define VFREDMINVS_FLOAT(va, vb, gvl) vfredmin_vs_f32m8_f32m1(v_res, va, vb, gvl)
|
||||
#define VIDV_MASK_UINT(mask, gvl) RISCV_RVV(vid_v_u32m8_m)(mask, v_min_index, gvl)
|
||||
#define VADDVX_MASK_UINT(mask, a, b, gvl) RISCV_RVV(vadd_vx_u32m8_m)(mask, a, a, b, gvl)
|
||||
#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u32m8)(vm, compressed, va, gvl)
|
||||
#else
|
||||
#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1
|
||||
#define VIDV_MASK_UINT __riscv_vid_v_u32m8_m
|
||||
#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_m
|
||||
#define VCOMPRESS RISCV_RVV(vcompress_vm_u32m8)
|
||||
#endif
|
||||
#define MASK_T vbool4_t
|
||||
#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f32m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
|
||||
#define VFMINVV_FLOAT vfmin_vv_f32m8
|
||||
#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4
|
||||
#define VMFIRSTM vmfirst_m_b4
|
||||
#define VMFGTVV_FLOAT RISCV_RVV(vmfgt_vv_f32m8_b4)
|
||||
#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8)
|
||||
#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1)
|
||||
#define VFMINVV_FLOAT RISCV_RVV(vfmin_vv_f32m8)
|
||||
#define VMFLEVF_FLOAT RISCV_RVV(vmfle_vf_f32m8_b4)
|
||||
#define VMFIRSTM RISCV_RVV(vfirst_m_b4)
|
||||
#define UINT_V_T vuint32m8_t
|
||||
#define VIDV_MASK_UINT vid_v_u32m8_m
|
||||
#define VIDV_UINT vid_v_u32m8
|
||||
#define VADDVX_MASK_UINT vadd_vx_u32m8_m
|
||||
#define VADDVX_UINT vadd_vx_u32m8
|
||||
#define VMVVX_UINT vmv_v_x_u32m8
|
||||
#define VIDV_UINT RISCV_RVV(vid_v_u32m8)
|
||||
#define VADDVX_UINT RISCV_RVV(vadd_vx_u32m8)
|
||||
#define VMVVX_UINT RISCV_RVV(vmv_v_x_u32m8)
|
||||
#define VMV_X RISCV_RVV(vmv_x_s_u32m8_u32)
|
||||
#endif
|
||||
|
||||
|
||||
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
BLASLONG i=0, j=0;
|
||||
FLOAT minf=FLT_MAX;
|
||||
BLASLONG i=0, j=0;
|
||||
unsigned int min_index = 0;
|
||||
if (n <= 0 || inc_x <= 0) return(min_index);
|
||||
if (n <= 0 || inc_x <= 0) return(min_index);
|
||||
FLOAT minf=FLT_MAX;
|
||||
|
||||
FLOAT_V_T vx, v_min;
|
||||
UINT_V_T v_min_index;
|
||||
MASK_T mask;
|
||||
unsigned int gvl = 0;
|
||||
FLOAT_V_T_M1 v_res, v_max;
|
||||
gvl = VSETVL_MAX;
|
||||
v_res = VFMVVF_FLOAT_M1(0, gvl);
|
||||
v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl);
|
||||
FLOAT_V_T_M1 v_res;
|
||||
v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1);
|
||||
|
||||
if(inc_x == 1){
|
||||
gvl = VSETVL(n);
|
||||
v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
|
||||
v_min_index = VMVVX_UINT(0, gvl);
|
||||
v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
|
||||
for(i=0,j=0; i < n/gvl; i++){
|
||||
vx = VLEV_FLOAT(&x[j], gvl);
|
||||
//index where element less than v_min
|
||||
mask = VMFLTVV_FLOAT(vx, v_min, gvl);
|
||||
v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl);
|
||||
/*
|
||||
#if defined(DOUBLE)
|
||||
asm volatile(
|
||||
"vor.vv v0, %1, %1 \n\t"
|
||||
"vsetvli x0, %2, e64,m8 \n\t"
|
||||
"vid.v %0, v0.t \n\t"
|
||||
:"+v"(v_min_index)
|
||||
:"v"(mask), "r"(gvl)
|
||||
:"v0");
|
||||
#else
|
||||
asm volatile(
|
||||
"vor.vv v0, %1, %1 \n\t"
|
||||
"vsetvli x0, %2, e32,m8 \n\t"
|
||||
"vid.v %0, v0.t \n\t"
|
||||
:"+v"(v_min_index)
|
||||
:"v"(mask), "r"(gvl)
|
||||
:"v0");
|
||||
#endif
|
||||
*/
|
||||
v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j,gvl);
|
||||
|
||||
//index where element greater than v_min
|
||||
mask = VMFGTVV_FLOAT(v_min, vx, gvl);
|
||||
v_min_index = VIDV_MASK_UINT(mask, gvl);
|
||||
v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, gvl);
|
||||
|
||||
//update v_min and start_index j
|
||||
v_min = VFMINVV_FLOAT(v_min, vx, gvl);
|
||||
j += gvl;
|
||||
}
|
||||
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
|
||||
minf = *((FLOAT*)&v_res);
|
||||
v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
|
||||
minf = EXTRACT_FLOAT(v_res);
|
||||
mask = VMFLEVF_FLOAT(v_min, minf, gvl);
|
||||
min_index = VMFIRSTM(mask,gvl);
|
||||
min_index = *((unsigned int*)&v_min_index+min_index);
|
||||
UINT_V_T compressed;
|
||||
compressed = VCOMPRESS(v_min_index, mask, gvl);
|
||||
min_index = VMV_X(compressed);
|
||||
|
||||
if(j < n){
|
||||
gvl = VSETVL(n-j);
|
||||
v_min = VLEV_FLOAT(&x[j], gvl);
|
||||
|
||||
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
|
||||
FLOAT cur_minf = *((FLOAT*)&v_res);
|
||||
if(cur_minf < minf){
|
||||
v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
|
||||
FLOAT cur_minf = EXTRACT_FLOAT(v_res);
|
||||
if(cur_minf > minf){
|
||||
//tail index
|
||||
v_min_index = VIDV_UINT(gvl);
|
||||
v_min_index = VADDVX_UINT(v_min_index, j, gvl);
|
||||
|
||||
mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl);
|
||||
min_index = VMFIRSTM(mask,gvl);
|
||||
min_index = *((unsigned int*)&v_min_index+min_index);
|
||||
UINT_V_T compressed;
|
||||
compressed = VCOMPRESS(v_min_index, mask, gvl);
|
||||
min_index = VMV_X(compressed);
|
||||
}
|
||||
}
|
||||
}else{
|
||||
@@ -159,59 +156,39 @@ asm volatile(
|
||||
for(i=0,j=0; i < n/gvl; i++){
|
||||
vx = VLSEV_FLOAT(&x[idx], stride_x, gvl);
|
||||
|
||||
//index where element less than v_min
|
||||
mask = VMFLTVV_FLOAT(vx, v_min, gvl);
|
||||
v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl);
|
||||
/*
|
||||
#if defined(DOUBLE)
|
||||
asm volatile(
|
||||
"vor.vv v0, %1, %1 \n\t"
|
||||
"vsetvli x0, %2, e64,m8 \n\t"
|
||||
"vid.v %0, v0.t \n\t"
|
||||
:"+v"(v_min_index)
|
||||
:"v"(mask), "r"(gvl)
|
||||
:"v0");
|
||||
#else
|
||||
asm volatile(
|
||||
"vor.vv v0, %1, %1 \n\t"
|
||||
"vsetvli x0, %2, e32,m8 \n\t"
|
||||
"vid.v %0, v0.t \n\t"
|
||||
:"+v"(v_min_index)
|
||||
:"v"(mask), "r"(gvl)
|
||||
:"v0");
|
||||
#endif
|
||||
*/
|
||||
|
||||
v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j,gvl);
|
||||
//index where element greater than v_min
|
||||
mask = VMFGTVV_FLOAT(v_min, vx, gvl);
|
||||
v_min_index = VIDV_MASK_UINT(mask, gvl);
|
||||
v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, gvl);
|
||||
|
||||
//update v_min and start_index j
|
||||
v_min = VFMINVV_FLOAT(v_min, vx, gvl);
|
||||
j += gvl;
|
||||
idx += inc_v;
|
||||
}
|
||||
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
|
||||
minf = *((FLOAT*)&v_res);
|
||||
v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
|
||||
minf = EXTRACT_FLOAT(v_res);
|
||||
mask = VMFLEVF_FLOAT(v_min, minf, gvl);
|
||||
min_index = VMFIRSTM(mask,gvl);
|
||||
min_index = *((unsigned int*)&v_min_index+min_index);
|
||||
UINT_V_T compressed;
|
||||
compressed = VCOMPRESS(v_min_index, mask, gvl);
|
||||
min_index = VMV_X(compressed);
|
||||
|
||||
if(j < n){
|
||||
gvl = VSETVL(n-j);
|
||||
v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl);
|
||||
|
||||
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
|
||||
FLOAT cur_minf = *((FLOAT*)&v_res);
|
||||
if(cur_minf < minf){
|
||||
v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
|
||||
FLOAT cur_minf = EXTRACT_FLOAT(v_res);
|
||||
if(cur_minf > minf){
|
||||
//tail index
|
||||
v_min_index = VIDV_UINT(gvl);
|
||||
v_min_index = VADDVX_UINT(v_min_index, j, gvl);
|
||||
|
||||
mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl);
|
||||
min_index = VMFIRSTM(mask,gvl);
|
||||
min_index = *((unsigned int*)&v_min_index+min_index);
|
||||
UINT_V_T compressed;
|
||||
compressed = VCOMPRESS(v_min_index, mask, gvl);
|
||||
min_index = VMV_X(compressed);
|
||||
}
|
||||
}
|
||||
}
|
||||
return(min_index+1);
|
||||
return(min_index+1);
|
||||
}
|
||||
|
||||
|
||||
|
||||
172
kernel/riscv64/izamax_rvv.c
Normal file
172
kernel/riscv64/izamax_rvv.c
Normal file
@@ -0,0 +1,172 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Precision-dependent aliases for the RVV intrinsics and vector types used
 * below.  With DOUBLE defined every name maps to the 64-bit-element, LMUL=4
 * variant (mask type vbool16_t); otherwise the same names map to the
 * 32-bit-element, LMUL=4 variant (mask type vbool8_t).  The _M1 names refer
 * to LMUL=1 types/intrinsics, and the _TU names map to the _tu / _tumu
 * policy forms of the intrinsics.
 */
#if defined(DOUBLE)
|
||||
#define VSETVL(n) __riscv_vsetvl_e64m4(n)
|
||||
#define VSETVL_MAX __riscv_vsetvlmax_e64m4()
|
||||
#define FLOAT_V_T vfloat64m4_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
/* two-field tuple type returned by the segment loads; VGET_VX2 extracts one field */
#define FLOAT_VX2_T vfloat64m4x2_t
|
||||
#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4
|
||||
#define VLEV_FLOAT __riscv_vle64_v_f64m4
|
||||
#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
|
||||
#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2
|
||||
#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2
|
||||
#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m4_f64m1
|
||||
#define MASK_T vbool16_t
|
||||
#define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m4_b16
|
||||
#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m4_b16
|
||||
#define VMFGEVF_FLOAT __riscv_vmfge_vf_f64m4_b16
|
||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
|
||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
|
||||
#define VFABSV_FLOAT __riscv_vfabs_v_f64m4
|
||||
#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m4_tu
|
||||
#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4
|
||||
#define VFIRSTM __riscv_vfirst_m_b16
|
||||
/* unsigned vector type and operations used for the per-lane index bookkeeping */
#define UINT_V_T vuint64m4_t
|
||||
#define VIDV_MASK_UINT_TU __riscv_vid_v_u64m4_tumu
|
||||
#define VIDV_UINT __riscv_vid_v_u64m4
|
||||
#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u64m4_tumu
|
||||
#define VADDVX_UINT __riscv_vadd_vx_u64m4
|
||||
#define VMVVX_UINT __riscv_vmv_v_x_u64m4
|
||||
#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64
|
||||
#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u64m4
|
||||
#define VMVVXS_UINT __riscv_vmv_x_s_u64m4_u64
|
||||
#else
|
||||
/* single-precision build: same alias names, 32-bit-element variants */
#define VSETVL(n) __riscv_vsetvl_e32m4(n)
|
||||
#define VSETVL_MAX __riscv_vsetvlmax_e32m4()
|
||||
#define FLOAT_V_T vfloat32m4_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define FLOAT_VX2_T vfloat32m4x2_t
|
||||
#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4
|
||||
#define VLEV_FLOAT __riscv_vle32_v_f32m4
|
||||
#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
|
||||
#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2
|
||||
#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2
|
||||
#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m4_f32m1
|
||||
#define MASK_T vbool8_t
|
||||
#define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m4_b8
|
||||
#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m4_b8
|
||||
#define VMFGEVF_FLOAT __riscv_vmfge_vf_f32m4_b8
|
||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
|
||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
|
||||
#define VFABSV_FLOAT __riscv_vfabs_v_f32m4
|
||||
#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m4_tu
|
||||
#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4
|
||||
#define VFIRSTM __riscv_vfirst_m_b8
|
||||
#define UINT_V_T vuint32m4_t
|
||||
#define VIDV_MASK_UINT_TU __riscv_vid_v_u32m4_tumu
|
||||
#define VIDV_UINT __riscv_vid_v_u32m4
|
||||
#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u32m4_tumu
|
||||
#define VADDVX_UINT __riscv_vadd_vx_u32m4
|
||||
#define VMVVX_UINT __riscv_vmv_v_x_u32m4
|
||||
#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32
|
||||
#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u32m4
|
||||
#define VMVVXS_UINT __riscv_vmv_x_s_u32m4_u32
|
||||
#endif
|
||||
|
||||
/*
 * Returns the 1-based position of the first complex element of x with the
 * largest |re| + |im|, or 0 when n <= 0 or inc_x <= 0.
 *
 *   n      number of complex elements to scan
 *   x      input vector, stored as interleaved (re, im) pairs
 *   inc_x  distance, in complex elements, between consecutive entries of x
 *
 * Every vector lane keeps a running maximum (v_max) together with the
 * position at which that maximum was first seen in that lane (v_max_index).
 * After the scan the lane maxima are reduced to one scalar, and the smallest
 * position among the lanes holding that scalar is returned.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    if (n <= 0 || inc_x <= 0) return 0;

    const size_t vlmax = VSETVL_MAX;
    /* |re| + |im| is never negative, so a seed of -1 lets the first
       comparable value seen by each lane replace it */
    FLOAT_V_T v_max = VFMVVF_FLOAT(-1, vlmax);
    UINT_V_T v_max_index = VMVVX_UINT(0, vlmax);
    FLOAT_VX2_T vxx2;
    FLOAT_V_T vx0, vx1;
    MASK_T mask;
    BLASLONG j = 0;     /* position of lane 0 within x for the current chunk */

    if (inc_x == 1) {

        for (size_t vl; n > 0; n -= vl, x += vl*2, j += vl) {
            vl = VSETVL(n);

            /* two-field segment load: field 0 = real parts, field 1 = imaginary parts */
            vxx2 = VLSEG_FLOAT(x, vl);

            vx0 = VGET_VX2(vxx2, 0);
            vx1 = VGET_VX2(vxx2, 1);

            vx0 = VFABSV_FLOAT(vx0, vl);
            vx1 = VFABSV_FLOAT(vx1, vl);

            vx0 = VFADDVV_FLOAT(vx0, vx1, vl);

            /* lanes whose new value is strictly above their running maximum
               take the position j + lane id; the rest keep theirs */
            mask = VMFLTVV_FLOAT(v_max, vx0, vl);
            v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl);
            v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl);

            /* _tu form: lanes past vl keep the maximum from earlier chunks */
            v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx0, vl);
        }

    } else {

        /* strided segment loads take the stride in bytes; one entry is two FLOATs */
        BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;

        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, j += vl) {
            vl = VSETVL(n);

            vxx2 = VLSSEG_FLOAT(x, stride_x, vl);

            vx0 = VGET_VX2(vxx2, 0);
            vx1 = VGET_VX2(vxx2, 1);

            vx0 = VFABSV_FLOAT(vx0, vl);
            vx1 = VFABSV_FLOAT(vx1, vl);

            vx0 = VFADDVV_FLOAT(vx0, vx1, vl);

            /* same per-lane update as the unit-stride loop */
            mask = VMFLTVV_FLOAT(v_max, vx0, vl);
            v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl);
            v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl);

            v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx0, vl);
        }

    }

    /* reduce the lane maxima to a scalar, seeded with 0 */
    FLOAT_V_T_M1 v_res = VFMVVF_FLOAT_M1(0, vlmax);
    v_res = VFREDMAXVS_FLOAT(v_max, v_res, vlmax);
    const FLOAT maxf = VFMVFS_FLOAT_M1(v_res);

    /* lanes holding the global maximum */
    mask = VMFGEVF_FLOAT(v_max, maxf, vlmax);

    /* No lane reaches maxf when none ever rose above its -1 seed (for
       example when every |re| + |im| compared as unordered).  Position 1 is
       what the earlier slide-based extraction produced in that case. */
    if (VFIRSTM(mask, vlmax) < 0) return 1;

    /* Several lanes can hold the maximum, and the lowest-numbered lane does
       not necessarily hold the lowest position (lane 0 may have found it in
       a later chunk than lane 1).  Take the unsigned minimum of the recorded
       positions over the matching lanes so the first occurrence is returned.
       The result is kept at full index width; an unsigned int would drop the
       upper half of a 64-bit position. */
#if defined(DOUBLE)
    vuint64m1_t v_first = __riscv_vmv_v_x_u64m1(~(uint64_t)0, 1);
    v_first = __riscv_vredminu_vs_u64m4_u64m1_m(mask, v_max_index, v_first, vlmax);
    const BLASLONG max_index = (BLASLONG)__riscv_vmv_x_s_u64m1_u64(v_first);
#else
    vuint32m1_t v_first = __riscv_vmv_v_x_u32m1(~(uint32_t)0, 1);
    v_first = __riscv_vredminu_vs_u32m4_u32m1_m(mask, v_max_index, v_first, vlmax);
    const BLASLONG max_index = (BLASLONG)__riscv_vmv_x_s_u32m1_u32(v_first);
#endif

    return max_index + 1;
}
|
||||
@@ -27,241 +27,146 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
#include <float.h>
|
||||
|
||||
#if defined(DOUBLE)
|
||||
|
||||
#define VSETVL(n) vsetvl_e64m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e64m1()
|
||||
#define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n)
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
|
||||
#define VLSEV_FLOAT vlse64_v_f64m8
|
||||
#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
|
||||
#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8)
|
||||
#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8)
|
||||
#ifdef RISCV_0p10_INTRINSICS
|
||||
#define VFREDMAXVS_FLOAT(va, vb, gvl) RISCV_RVV(vfredmax_vs_f64m8_f64m1)(v_res, va, vb, gvl)
|
||||
#define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_m)
|
||||
#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_m)
|
||||
#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u64m8)(vm, compressed, va, gvl)
|
||||
#else
|
||||
#define VFREDMAXVS_FLOAT RISCV_RVV(vfredmax_vs_f64m8_f64m1)
|
||||
#define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_mu)
|
||||
#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_mu)
|
||||
#define VCOMPRESS RISCV_RVV(vcompress_vm_u64m8)
|
||||
#endif
|
||||
#define MASK_T vbool8_t
|
||||
#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
|
||||
#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f64m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
|
||||
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
|
||||
#define VFMAXVV_FLOAT vfmax_vv_f64m8
|
||||
#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8
|
||||
#define VMFIRSTM vmfirst_m_b8
|
||||
#define VMFLTVV_FLOAT RISCV_RVV(vmflt_vv_f64m8_b8)
|
||||
#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8)
|
||||
#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1)
|
||||
#define VFMAXVV_FLOAT RISCV_RVV(vfmax_vv_f64m8)
|
||||
#define VMFGEVF_FLOAT RISCV_RVV(vmfge_vf_f64m8_b8)
|
||||
#define VMFIRSTM RISCV_RVV(vfirst_m_b8)
|
||||
#define UINT_V_T vuint64m8_t
|
||||
#define VSEVU_UINT vse64_v_u64m8
|
||||
#define VSEVU_UINT RISCV_RVV(vse64_v_u64m8)
|
||||
#define UINT_T long unsigned int
|
||||
#define VIDV_MASK_UINT vid_v_u64m8_m
|
||||
#define VIDV_UINT vid_v_u64m8
|
||||
#define VADDVX_MASK_UINT vadd_vx_u64m8_m
|
||||
#define VADDVX_UINT vadd_vx_u64m8
|
||||
#define VFADDVV_FLOAT vfadd_vv_f64m8
|
||||
#define VMVVX_UINT vmv_v_x_u64m8
|
||||
#define VIDV_UINT RISCV_RVV(vid_v_u64m8)
|
||||
#define VADDVX_UINT RISCV_RVV(vadd_vx_u64m8)
|
||||
#define VMVVX_UINT RISCV_RVV(vmv_v_x_u64m8)
|
||||
#define VFABS_FLOAT RISCV_RVV(vfabs_v_f64m8)
|
||||
#define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f64m8)
|
||||
#define VMV_X RISCV_RVV(vmv_x_s_u64m8_u64)
|
||||
#else
|
||||
|
||||
#define ABS fabsf
|
||||
#define VSETVL(n) vsetvl_e32m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m1()
|
||||
#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n)
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
|
||||
#define VLSEV_FLOAT vlse32_v_f32m8
|
||||
#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
|
||||
#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8)
|
||||
#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8)
|
||||
#ifdef RISCV_0p10_INTRINSICS
|
||||
#define VFREDMAXVS_FLOAT(va, vb, gvl) RISCV_RVV(vfredmax_vs_f32m8_f32m1)(v_res, va, vb, gvl)
|
||||
#define VIDV_MASK_UINT RISCV_RVV(vid_v_u32m8_m)
|
||||
#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m8_m)
|
||||
#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u32m8)(vm, compressed, va, gvl)
|
||||
#else
|
||||
#define VFREDMAXVS_FLOAT RISCV_RVV(vfredmax_vs_f32m8_f32m1)
|
||||
#define VIDV_MASK_UINT RISCV_RVV(vid_v_u32m8_mu)
|
||||
#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m8_mu)
|
||||
#define VCOMPRESS RISCV_RVV(vcompress_vm_u32m8)
|
||||
#endif
|
||||
#define MASK_T vbool4_t
|
||||
#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
|
||||
#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f32m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
|
||||
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
|
||||
#define VFMAXVV_FLOAT vfmax_vv_f32m8
|
||||
#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4
|
||||
#define VMFIRSTM vmfirst_m_b4
|
||||
#define VMFLTVV_FLOAT RISCV_RVV(vmflt_vv_f32m8_b4)
|
||||
#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8)
|
||||
#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1)
|
||||
#define VFMAXVV_FLOAT RISCV_RVV(vfmax_vv_f32m8)
|
||||
#define VMFGEVF_FLOAT RISCV_RVV(vmfge_vf_f32m8_b4)
|
||||
#define VMFIRSTM RISCV_RVV(vfirst_m_b4)
|
||||
#define UINT_V_T vuint32m8_t
|
||||
#define UINT_T unsigned int
|
||||
#define VSEVU_UINT vse32_v_u32m8
|
||||
#define VIDV_MASK_UINT vid_v_u32m8_m
|
||||
#define VIDV_UINT vid_v_u32m8
|
||||
#define VADDVX_MASK_UINT vadd_vx_u32m8_m
|
||||
#define VADDVX_UINT vadd_vx_u32m8
|
||||
#define VFADDVV_FLOAT vfadd_vv_f32m8
|
||||
#define VMVVX_UINT vmv_v_x_u32m8
|
||||
#define VSEVU_UINT RISCV_RVV(vse32_v_u32m8)
|
||||
#define VIDV_UINT RISCV_RVV(vid_v_u32m8)
|
||||
#define VADDVX_UINT RISCV_RVV(vadd_vx_u32m8)
|
||||
#define VMVVX_UINT RISCV_RVV(vmv_v_x_u32m8)
|
||||
#define VFABS_FLOAT RISCV_RVV(vfabs_v_f32m8)
|
||||
#define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f32m8)
|
||||
#define VMV_X RISCV_RVV(vmv_x_s_u32m8_u32)
|
||||
#endif
|
||||
|
||||
#define RVV_M RVV_M8
|
||||
|
||||
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
BLASLONG i=0, j=0;
|
||||
FLOAT maxf=0.0;
|
||||
BLASLONG i=0, j=0;
|
||||
unsigned int max_index = 0;
|
||||
if (n <= 0 || inc_x <= 0) return(max_index);
|
||||
if (n <= 0 || inc_x <= 0) return(max_index);
|
||||
FLOAT maxf=-FLT_MAX;
|
||||
|
||||
FLOAT_V_T vx0, vx1, v_max;
|
||||
FLOAT_V_T vx, vx2, v_max;
|
||||
UINT_V_T v_max_index;
|
||||
MASK_T mask0, mask1;
|
||||
MASK_T mask;
|
||||
unsigned int gvl = 0;
|
||||
FLOAT_V_T_M1 v_res, v_z0;
|
||||
gvl = VSETVL_MAX;
|
||||
v_res = VFMVVF_FLOAT_M1(0, gvl);
|
||||
v_z0 = VFMVVF_FLOAT_M1(0, gvl);
|
||||
FLOAT_V_T_M1 v_res;
|
||||
v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1);
|
||||
|
||||
gvl = VSETVL(n);
|
||||
UINT_T temp_uint[gvl];
|
||||
unsigned int stride_x = inc_x * 2 * sizeof(FLOAT);
|
||||
unsigned int idx = 0, inc_v = gvl * inc_x * 2;
|
||||
|
||||
v_max = VFMVVF_FLOAT(-FLT_MAX, gvl);
|
||||
v_max_index = VMVVX_UINT(0, gvl);
|
||||
v_max = VFMVVF_FLOAT(-1, gvl);
|
||||
BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
|
||||
BLASLONG inc_xv = gvl * inc_x * 2;
|
||||
BLASLONG ix = 0;
|
||||
for(i=0,j=0; i < n/gvl; i++){
|
||||
vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
|
||||
//fabs(vector)
|
||||
mask0 = VMFLTVF_FLOAT(vx0, 0, gvl);
|
||||
vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl);
|
||||
/*
|
||||
#if defined(DOUBLE)
|
||||
asm volatile(
|
||||
"vor.vv v0, %1, %1\n\t"
|
||||
"vsetvli x0, %3, e64,m8 \n\t"
|
||||
"vfrsub.vf %0, %0, %2, v0.t \n\t"
|
||||
:"+v"(vx0)
|
||||
:"v"(mask0), "f"(zero), "r"(gvl)
|
||||
:"v0");
|
||||
#else
|
||||
asm volatile(
|
||||
"vor.vv v0, %1, %1\n\t"
|
||||
"vsetvli x0, %3, e32,m8 \n\t"
|
||||
"vfrsub.vf %0, %0, %2, v0.t \n\t"
|
||||
:"+v"(vx0)
|
||||
:"v"(mask0), "f"(zero), "r"(gvl)
|
||||
:"v0");
|
||||
#endif
|
||||
*/
|
||||
vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
|
||||
//fabs(vector)
|
||||
mask1 = VMFLTVF_FLOAT(vx1, 0, gvl);
|
||||
vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl);
|
||||
/*
|
||||
#if defined(DOUBLE)
|
||||
asm volatile(
|
||||
"vor.vv v0, %1, %1\n\t"
|
||||
"vsetvli x0, %3, e64,m8 \n\t"
|
||||
"vfrsub.vf %0, %0, %2, v0.t \n\t"
|
||||
:"+v"(vx1)
|
||||
:"v"(mask1), "f"(zero), "r"(gvl)
|
||||
:"v0");
|
||||
#else
|
||||
asm volatile(
|
||||
"vor.vv v0, %1, %1\n\t"
|
||||
"vsetvli x0, %3, e32,m8 \n\t"
|
||||
"vfrsub.vf %0, %0, %2, v0.t \n\t"
|
||||
:"+v"(vx1)
|
||||
:"v"(mask1), "f"(zero), "r"(gvl)
|
||||
:"v0");
|
||||
#endif
|
||||
*/
|
||||
vx0 = VFADDVV_FLOAT(vx0, vx1, gvl);
|
||||
vx = VLSEV_FLOAT(&x[idx], stride_x, gvl);
|
||||
vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl);
|
||||
vx = VFABS_FLOAT(vx, gvl);
|
||||
vx2 = VFABS_FLOAT(vx2, gvl);
|
||||
vx = VFADDVV_FLOAT(vx, vx2, gvl);
|
||||
|
||||
|
||||
//index where element greater than v_max
|
||||
mask0 = VMFLTVV_FLOAT(v_max, vx0, gvl);
|
||||
v_max_index = VIDV_MASK_UINT(mask0, v_max_index, gvl);
|
||||
/*
|
||||
#if defined(DOUBLE)
|
||||
asm volatile(
|
||||
"vor.vv v0, %1, %1 \n\t"
|
||||
"vsetvli x0, %2, e64,m8 \n\t"
|
||||
"vid.v %0, v0.t \n\t"
|
||||
:"+v"(v_max_index)
|
||||
:"v"(mask0), "r"(gvl)
|
||||
:"v0");
|
||||
#else
|
||||
asm volatile(
|
||||
"vor.vv v0, %1, %1 \n\t"
|
||||
"vsetvli x0, %2, e32,m8 \n\t"
|
||||
"vid.v %0, v0.t \n\t"
|
||||
:"+v"(v_max_index)
|
||||
:"v"(mask0), "r"(gvl)
|
||||
:"v0");
|
||||
#endif
|
||||
*/
|
||||
v_max_index = VADDVX_MASK_UINT(mask0, v_max_index, v_max_index, j, gvl);
|
||||
mask = VMFLTVV_FLOAT(v_max, vx, gvl);
|
||||
v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl);
|
||||
v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl);
|
||||
|
||||
//update v_max and start_index j
|
||||
v_max = VFMAXVV_FLOAT(v_max, vx0, gvl);
|
||||
v_max = VFMAXVV_FLOAT(v_max, vx, gvl);
|
||||
j += gvl;
|
||||
ix += inc_xv;
|
||||
idx += inc_v;
|
||||
}
|
||||
vx0 = VFMVVF_FLOAT(0, gvl);
|
||||
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl);
|
||||
maxf = VFMVFS_FLOAT(v_res);
|
||||
mask0 = VMFGEVF_FLOAT(v_max, maxf, gvl);
|
||||
max_index = VMFIRSTM(mask0,gvl);
|
||||
VSEVU_UINT(temp_uint,v_max_index,gvl);
|
||||
max_index = temp_uint[max_index];
|
||||
|
||||
v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
|
||||
maxf = EXTRACT_FLOAT(v_res);
|
||||
mask = VMFGEVF_FLOAT(v_max, maxf, gvl);
|
||||
UINT_V_T compressed;
|
||||
compressed = VCOMPRESS(v_max_index, mask, gvl);
|
||||
max_index = VMV_X(compressed);
|
||||
|
||||
if(j < n){
|
||||
gvl = VSETVL(n-j);
|
||||
v_max_index = VMVVX_UINT(0, gvl);
|
||||
vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
|
||||
//fabs(vector)
|
||||
mask0 = VMFLTVF_FLOAT(vx0, 0, gvl);
|
||||
vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl);
|
||||
/*
|
||||
#if defined(DOUBLE)
|
||||
asm volatile(
|
||||
"vor.vv v0, %1, %1\n\t"
|
||||
"vsetvli x0, %3, e64,m8 \n\t"
|
||||
"vfrsub.vf %0, %0, %2, v0.t \n\t"
|
||||
:"+v"(vx0)
|
||||
:"v"(mask0), "f"(zero), "r"(gvl)
|
||||
:"v0");
|
||||
#else
|
||||
asm volatile(
|
||||
"vor.vv v0, %1, %1\n\t"
|
||||
"vsetvli x0, %3, e32,m8 \n\t"
|
||||
"vfrsub.vf %0, %0, %2, v0.t \n\t"
|
||||
:"+v"(vx0)
|
||||
:"v"(mask0), "f"(zero), "r"(gvl)
|
||||
:"v0");
|
||||
#endif
|
||||
*/
|
||||
vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
|
||||
//fabs(vector)
|
||||
mask1 = VMFLTVF_FLOAT(vx1, 0, gvl);
|
||||
vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl);
|
||||
/*
|
||||
#if defined(DOUBLE)
|
||||
asm volatile(
|
||||
"vor.vv v0, %1, %1\n\t"
|
||||
"vsetvli x0, %3, e64,m8 \n\t"
|
||||
"vfrsub.vf %0, %0, %2, v0.t \n\t"
|
||||
:"+v"(vx1)
|
||||
:"v"(mask1), "f"(zero), "r"(gvl)
|
||||
:"v0");
|
||||
#else
|
||||
asm volatile(
|
||||
"vor.vv v0, %1, %1\n\t"
|
||||
"vsetvli x0, %3, e32,m8 \n\t"
|
||||
"vfrsub.vf %0, %0, %2, v0.t \n\t"
|
||||
:"+v"(vx1)
|
||||
:"v"(mask1), "f"(zero), "r"(gvl)
|
||||
:"v0");
|
||||
#endif
|
||||
*/
|
||||
v_max = VFADDVV_FLOAT(vx0, vx1, gvl);
|
||||
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl);
|
||||
FLOAT cur_maxf = VFMVFS_FLOAT(v_res);
|
||||
v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl);
|
||||
vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl);
|
||||
v_max = VFABS_FLOAT(v_max, gvl);
|
||||
vx2 = VFABS_FLOAT(vx2, gvl);
|
||||
v_max = VFADDVV_FLOAT(v_max, vx2, gvl);
|
||||
|
||||
v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
|
||||
FLOAT cur_maxf = EXTRACT_FLOAT(v_res);
|
||||
|
||||
if(cur_maxf > maxf){
|
||||
//tail index
|
||||
v_max_index = VIDV_UINT(gvl);
|
||||
v_max_index = VADDVX_UINT(v_max_index, j, gvl);
|
||||
|
||||
mask0 = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
|
||||
max_index = VMFIRSTM(mask0,gvl);
|
||||
VSEVU_UINT(temp_uint,v_max_index,gvl);
|
||||
max_index = temp_uint[max_index];
|
||||
|
||||
mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
|
||||
UINT_V_T compressed;
|
||||
compressed = VCOMPRESS(v_max_index, mask, gvl);
|
||||
max_index = VMV_X(compressed);
|
||||
}
|
||||
}
|
||||
return(max_index+1);
|
||||
|
||||
return(max_index+1);
|
||||
}
|
||||
|
||||
|
||||
|
||||
171
kernel/riscv64/izamin_rvv.c
Normal file
171
kernel/riscv64/izamin_rvv.c
Normal file
@@ -0,0 +1,171 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <float.h>
|
||||
|
||||
/*
 * Precision-neutral aliases for the RVV intrinsics used by this kernel.
 * With DOUBLE defined every alias resolves to the 64-bit element (e64/f64/u64)
 * variant, otherwise to the 32-bit (e32/f32/u32) variant.  Data and index
 * vectors use the m4 register-group types, the reduction result uses the m1
 * type, and the x2 tuple type carries the two fields returned by the
 * two-field segment loads.
 */
#if defined(DOUBLE)
|
||||
#define VSETVL(n) __riscv_vsetvl_e64m4(n)
|
||||
#define VSETVL_MAX __riscv_vsetvlmax_e64m4()
|
||||
#define FLOAT_V_T vfloat64m4_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define FLOAT_VX2_T vfloat64m4x2_t
|
||||
#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4
|
||||
#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2
|
||||
#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2
|
||||
#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m4_f64m1
|
||||
#define MASK_T vbool16_t
|
||||
#define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m4_b16
|
||||
#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m4_b16
|
||||
#define VMFLEVF_FLOAT __riscv_vmfle_vf_f64m4_b16
|
||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
|
||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
|
||||
#define VFABSV_FLOAT __riscv_vfabs_v_f64m4
|
||||
#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m4_tu
|
||||
#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4
|
||||
#define VFIRSTM __riscv_vfirst_m_b16
|
||||
#define UINT_V_T vuint64m4_t
|
||||
#define VIDV_MASK_UINT_TU __riscv_vid_v_u64m4_tumu
|
||||
#define VIDV_UINT __riscv_vid_v_u64m4
|
||||
#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u64m4_tumu
|
||||
#define VADDVX_UINT __riscv_vadd_vx_u64m4
|
||||
#define VMVVX_UINT __riscv_vmv_v_x_u64m4
|
||||
#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64
|
||||
#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u64m4
|
||||
#define VMVVXS_UINT __riscv_vmv_x_s_u64m4_u64
|
||||
#else
|
||||
#define VSETVL(n) __riscv_vsetvl_e32m4(n)
|
||||
#define VSETVL_MAX __riscv_vsetvlmax_e32m4()
|
||||
#define FLOAT_V_T vfloat32m4_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define FLOAT_VX2_T vfloat32m4x2_t
|
||||
#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4
|
||||
#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2
|
||||
#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2
|
||||
#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m4_f32m1
|
||||
#define MASK_T vbool8_t
|
||||
#define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m4_b8
|
||||
#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m4_b8
|
||||
#define VMFLEVF_FLOAT __riscv_vmfle_vf_f32m4_b8
|
||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
|
||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
|
||||
#define VFABSV_FLOAT __riscv_vfabs_v_f32m4
|
||||
#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m4_tu
|
||||
#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4
|
||||
#define VFIRSTM __riscv_vfirst_m_b8
|
||||
#define UINT_V_T vuint32m4_t
|
||||
#define VIDV_MASK_UINT_TU __riscv_vid_v_u32m4_tumu
|
||||
#define VIDV_UINT __riscv_vid_v_u32m4
|
||||
#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u32m4_tumu
|
||||
#define VADDVX_UINT __riscv_vadd_vx_u32m4
|
||||
#define VMVVX_UINT __riscv_vmv_v_x_u32m4
|
||||
#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32
|
||||
#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u32m4
|
||||
#define VMVVXS_UINT __riscv_vmv_x_s_u32m4_u32
|
||||
#endif
|
||||
|
||||
/*
 * Index of the complex element with the smallest |re| + |im|.
 *
 *   n      number of complex elements to scan
 *   x      interleaved (re, im) pairs
 *   inc_x  distance between consecutive elements, counted in complex elements
 *
 * Returns the 1-based index of a minimal element, or 0 when n <= 0 or
 * inc_x <= 0.
 *
 * NOTE(review): each lane remembers the first index at which its own minimum
 * was seen, and the final pick is the first lane holding the global minimum.
 * When the same minimum appears in several lanes this is not necessarily the
 * smallest element index - confirm whether first-occurrence semantics are
 * required by callers.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    if (n <= 0 || inc_x <= 0) return 0;

    /*
     * The running-minimum sentinel has to be the largest finite value of the
     * element type.  FLT_MAX is too small for the DOUBLE build: if every
     * magnitude exceeded it, the "less than" mask below would never fire and
     * no index would ever be recorded.
     */
#if defined(DOUBLE)
    const FLOAT sentinel = DBL_MAX;
#else
    const FLOAT sentinel = FLT_MAX;
#endif

    const size_t vlmax = VSETVL_MAX;
    FLOAT_V_T v_min = VFMVVF_FLOAT(sentinel, vlmax);   /* per-lane minimum      */
    UINT_V_T v_min_index = VMVVX_UINT(0, vlmax);       /* per-lane index of it  */
    FLOAT_V_T v_re, v_im;
    FLOAT_VX2_T v_pair;
    MASK_T mask;
    BLASLONG j = 0;                                    /* index of chunk start  */

    if (inc_x == 1) {

        for (size_t vl; n > 0; n -= vl, x += vl * 2, j += vl) {
            vl = VSETVL(n);

            /* de-interleave: field 0 = real parts, field 1 = imaginary parts */
            v_pair = VLSEG_FLOAT(x, vl);
            v_re = VFABSV_FLOAT(VGET_VX2(v_pair, 0), vl);
            v_im = VFABSV_FLOAT(VGET_VX2(v_pair, 1), vl);
            v_re = VFADDVV_FLOAT(v_re, v_im, vl);

            /* lanes whose new magnitude beats the running minimum get index j + lane */
            mask = VMFLTVV_FLOAT(v_re, v_min, vl);
            v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl);
            v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl);

            /* tail-undisturbed so lanes >= vl keep their earlier minimum */
            v_min = VFMINVV_FLOAT_TU(v_min, v_min, v_re, vl);
        }

    } else {

        /* byte stride between consecutive complex elements */
        const BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;

        for (size_t vl; n > 0; n -= vl, x += vl * inc_x * 2, j += vl) {
            vl = VSETVL(n);

            v_pair = VLSSEG_FLOAT(x, stride_x, vl);
            v_re = VFABSV_FLOAT(VGET_VX2(v_pair, 0), vl);
            v_im = VFABSV_FLOAT(VGET_VX2(v_pair, 1), vl);
            v_re = VFADDVV_FLOAT(v_re, v_im, vl);

            mask = VMFLTVV_FLOAT(v_re, v_min, vl);
            v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl);
            v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl);

            v_min = VFMINVV_FLOAT_TU(v_min, v_min, v_re, vl);
        }

    }

    /* reduce the per-lane minima to one scalar */
    FLOAT_V_T_M1 v_res = VFMVVF_FLOAT_M1(sentinel, vlmax);
    v_res = VFREDMINVS_FLOAT(v_min, v_res, vlmax);
    const FLOAT minf = VFMVFS_FLOAT_M1(v_res);

    /* first lane that holds the global minimum */
    mask = VMFLEVF_FLOAT(v_min, minf, vlmax);
    const long lane = VFIRSTM(mask, vlmax);

    /*
     * Bring that lane's recorded index to element 0 and read it out.  The
     * result is kept in BLASLONG so the 64-bit index of the DOUBLE build is
     * not truncated to 32 bits.
     */
    v_min_index = VSLIDEDOWN_UINT(v_min_index, (size_t)lane, vlmax);
    const BLASLONG min_index = (BLASLONG)VMVVXS_UINT(v_min_index);

    return min_index + 1;
}
|
||||
@@ -31,235 +31,142 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#if defined(DOUBLE)
|
||||
|
||||
#define VSETVL(n) vsetvl_e64m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e64m1()
|
||||
#define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n)
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
|
||||
#define VLSEV_FLOAT vlse64_v_f64m8
|
||||
#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
|
||||
#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8)
|
||||
#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8)
|
||||
#ifdef RISCV_0p10_INTRINSICS
|
||||
#define VFREDMINVS_FLOAT(va, vb, gvl) RISCV_RVV(vfredmin_vs_f64m8_f64m1)(v_res, va, vb, gvl)
|
||||
#define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_m)
|
||||
#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_m)
|
||||
#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u64m8)(vm, compressed, va, gvl)
|
||||
#else
|
||||
#define VFREDMINVS_FLOAT RISCV_RVV(vfredmin_vs_f64m8_f64m1)
|
||||
#define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_mu)
|
||||
#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_mu)
|
||||
#define VCOMPRESS RISCV_RVV(vcompress_vm_u64m8)
|
||||
#endif
|
||||
#define MASK_T vbool8_t
|
||||
#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
|
||||
#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f64m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
|
||||
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
|
||||
#define VFMINVV_FLOAT vfmin_vv_f64m8
|
||||
#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8
|
||||
#define VMFIRSTM vmfirst_m_b8
|
||||
#define VMFGTVV_FLOAT RISCV_RVV(vmfgt_vv_f64m8_b8)
|
||||
#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8)
|
||||
#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1)
|
||||
#define VFMINVV_FLOAT RISCV_RVV(vfmin_vv_f64m8)
|
||||
#define VMFLEVF_FLOAT RISCV_RVV(vmfle_vf_f64m8_b8)
|
||||
#define VMFIRSTM RISCV_RVV(vfirst_m_b8)
|
||||
#define UINT_V_T vuint64m8_t
|
||||
#define VSEVU_UINT vse64_v_u64m8
|
||||
#define UINT_T long unsigned int
|
||||
#define VIDV_MASK_UINT vid_v_u64m8_m
|
||||
#define VIDV_UINT vid_v_u64m8
|
||||
#define VADDVX_MASK_UINT vadd_vx_u64m8_m
|
||||
#define VADDVX_UINT vadd_vx_u64m8
|
||||
#define VFADDVV_FLOAT vfadd_vv_f64m8
|
||||
#define VMVVX_UINT vmv_v_x_u64m8
|
||||
#define VIDV_UINT RISCV_RVV(vid_v_u64m8)
|
||||
#define VADDVX_UINT RISCV_RVV(vadd_vx_u64m8)
|
||||
#define VMVVX_UINT RISCV_RVV(vmv_v_x_u64m8)
|
||||
#define VFABS_FLOAT RISCV_RVV(vfabs_v_f64m8)
|
||||
#define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f64m8)
|
||||
#define VMV_X RISCV_RVV(vmv_x_s_u64m8_u64)
|
||||
#else
|
||||
|
||||
#define ABS fabsf
|
||||
#define VSETVL(n) vsetvl_e32m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m1()
|
||||
#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n)
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
|
||||
#define VLSEV_FLOAT vlse32_v_f32m8
|
||||
#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
|
||||
#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8)
|
||||
#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8)
|
||||
#ifdef RISCV_0p10_INTRINSICS
|
||||
#define VFREDMINVS_FLOAT(va, vb, gvl) RISCV_RVV(vfredmin_vs_f32m8_f32m1)(v_res, va, vb, gvl)
|
||||
#define VIDV_MASK_UINT RISCV_RVV(vid_v_u32m8_m)
|
||||
#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m8_m)
|
||||
#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u32m8)(vm, compressed, va, gvl)
|
||||
#else
|
||||
#define VFREDMINVS_FLOAT RISCV_RVV(vfredmin_vs_f32m8_f32m1)
|
||||
#define VIDV_MASK_UINT RISCV_RVV(vid_v_u32m8_mu)
|
||||
#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m8_mu)
|
||||
#define VCOMPRESS RISCV_RVV(vcompress_vm_u32m8)
|
||||
#endif
|
||||
#define MASK_T vbool4_t
|
||||
#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
|
||||
#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f32m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
|
||||
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
|
||||
#define VFMINVV_FLOAT vfmin_vv_f32m8
|
||||
#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4
|
||||
#define VMFIRSTM vmfirst_m_b4
|
||||
#define VMFGTVV_FLOAT RISCV_RVV(vmfgt_vv_f32m8_b4)
|
||||
#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8)
|
||||
#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1)
|
||||
#define VFMINVV_FLOAT RISCV_RVV(vfmin_vv_f32m8)
|
||||
#define VMFLEVF_FLOAT RISCV_RVV(vmfle_vf_f32m8_b4)
|
||||
#define VMFIRSTM RISCV_RVV(vfirst_m_b4)
|
||||
#define UINT_V_T vuint32m8_t
|
||||
#define UINT_T unsigned int
|
||||
#define VSEVU_UINT vse32_v_u32m8
|
||||
#define VIDV_MASK_UINT vid_v_u32m8_m
|
||||
#define VIDV_UINT vid_v_u32m8
|
||||
#define VADDVX_MASK_UINT vadd_vx_u32m8_m
|
||||
#define VADDVX_UINT vadd_vx_u32m8
|
||||
#define VFADDVV_FLOAT vfadd_vv_f32m8
|
||||
#define VMVVX_UINT vmv_v_x_u32m8
|
||||
#define VSEVU_UINT RISCV_RVV(vse32_v_u32m8)
|
||||
#define VIDV_UINT RISCV_RVV(vid_v_u32m8)
|
||||
#define VADDVX_UINT RISCV_RVV(vadd_vx_u32m8)
|
||||
#define VMVVX_UINT RISCV_RVV(vmv_v_x_u32m8)
|
||||
#define VFABS_FLOAT RISCV_RVV(vfabs_v_f32m8)
|
||||
#define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f32m8)
|
||||
#define VMV_X RISCV_RVV(vmv_x_s_u32m8_u32)
|
||||
#endif
|
||||
|
||||
|
||||
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
BLASLONG i=0, j=0;
|
||||
FLOAT minf=FLT_MAX;
|
||||
BLASLONG i=0, j=0;
|
||||
unsigned int min_index = 0;
|
||||
if (n <= 0 || inc_x <= 0) return(min_index);
|
||||
if (n <= 0 || inc_x <= 0) return(min_index);
|
||||
FLOAT minf=FLT_MAX;
|
||||
|
||||
FLOAT_V_T vx0, vx1, v_min;
|
||||
FLOAT_V_T vx, vx2, v_min;
|
||||
UINT_V_T v_min_index;
|
||||
MASK_T mask0, mask1;
|
||||
MASK_T mask;
|
||||
unsigned int gvl = 0;
|
||||
FLOAT_V_T_M1 v_res, v_max;
|
||||
gvl = VSETVL_MAX;
|
||||
v_res = VFMVVF_FLOAT_M1(0, gvl);
|
||||
v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl);
|
||||
FLOAT_V_T_M1 v_res;
|
||||
v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1);
|
||||
|
||||
gvl = VSETVL(n);
|
||||
UINT_T temp_uint[gvl];
|
||||
v_min_index = VMVVX_UINT(0, gvl);
|
||||
v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
|
||||
BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
|
||||
BLASLONG inc_xv = gvl * inc_x * 2;
|
||||
BLASLONG ix = 0;
|
||||
for(i=0,j=0; i < n/gvl; i++){
|
||||
vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
|
||||
//fabs(vector)
|
||||
mask0 = VMFLTVF_FLOAT(vx0, 0, gvl);
|
||||
vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl);
|
||||
/*
|
||||
#if defined(DOUBLE)
|
||||
asm volatile(
|
||||
"vor.vv v0, %1, %1\n\t"
|
||||
"vsetvli x0, %3, e64,m8 \n\t"
|
||||
"vfrsub.vf %0, %0, %2, v0.t \n\t"
|
||||
:"+v"(vx0)
|
||||
:"v"(mask0), "f"(zero), "r"(gvl)
|
||||
:"v0");
|
||||
#else
|
||||
asm volatile(
|
||||
"vor.vv v0, %1, %1\n\t"
|
||||
"vsetvli x0, %3, e32,m8 \n\t"
|
||||
"vfrsub.vf %0, %0, %2, v0.t \n\t"
|
||||
:"+v"(vx0)
|
||||
:"v"(mask0), "f"(zero), "r"(gvl)
|
||||
:"v0");
|
||||
#endif
|
||||
*/
|
||||
vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
|
||||
//fabs(vector)
|
||||
mask1 = VMFLTVF_FLOAT(vx1, 0, gvl);
|
||||
vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl);
|
||||
/*
|
||||
#if defined(DOUBLE)
|
||||
asm volatile(
|
||||
"vor.vv v0, %1, %1\n\t"
|
||||
"vsetvli x0, %3, e64,m8 \n\t"
|
||||
"vfrsub.vf %0, %0, %2, v0.t \n\t"
|
||||
:"+v"(vx1)
|
||||
:"v"(mask1), "f"(zero), "r"(gvl)
|
||||
:"v0");
|
||||
#else
|
||||
asm volatile(
|
||||
"vor.vv v0, %1, %1\n\t"
|
||||
"vsetvli x0, %3, e32,m8 \n\t"
|
||||
"vfrsub.vf %0, %0, %2, v0.t \n\t"
|
||||
:"+v"(vx1)
|
||||
:"v"(mask1), "f"(zero), "r"(gvl)
|
||||
:"v0");
|
||||
#endif
|
||||
*/
|
||||
vx0 = VFADDVV_FLOAT(vx0, vx1, gvl);
|
||||
unsigned int stride_x = inc_x * 2 * sizeof(FLOAT);
|
||||
unsigned int idx = 0, inc_v = gvl * inc_x * 2;
|
||||
|
||||
//index where element less than v_min
|
||||
mask0 = VMFLTVV_FLOAT(vx0, v_min, gvl);
|
||||
v_min_index = VIDV_MASK_UINT(mask0, v_min_index, gvl);
|
||||
/*
|
||||
#if defined(DOUBLE)
|
||||
asm volatile(
|
||||
"vor.vv v0, %1, %1 \n\t"
|
||||
"vsetvli x0, %2, e64,m8 \n\t"
|
||||
"vid.v %0, v0.t \n\t"
|
||||
:"+v"(v_min_index)
|
||||
:"v"(mask0), "r"(gvl)
|
||||
:"v0");
|
||||
#else
|
||||
asm volatile(
|
||||
"vor.vv v0, %1, %1 \n\t"
|
||||
"vsetvli x0, %2, e32,m8 \n\t"
|
||||
"vid.v %0, v0.t \n\t"
|
||||
:"+v"(v_min_index)
|
||||
:"v"(mask0), "r"(gvl)
|
||||
:"v0");
|
||||
#endif
|
||||
*/
|
||||
v_min_index = VADDVX_MASK_UINT(mask0, v_min_index, v_min_index, j, gvl);
|
||||
v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
|
||||
v_min_index = VMVVX_UINT(0, gvl);
|
||||
for(i=0,j=0; i < n/gvl; i++){
|
||||
vx = VLSEV_FLOAT(&x[idx], stride_x, gvl);
|
||||
vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl);
|
||||
vx = VFABS_FLOAT(vx, gvl);
|
||||
vx2 = VFABS_FLOAT(vx2, gvl);
|
||||
vx = VFADDVV_FLOAT(vx, vx2, gvl);
|
||||
|
||||
|
||||
//index where element greater than v_min
|
||||
mask = VMFGTVV_FLOAT(v_min, vx, gvl);
|
||||
v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl);
|
||||
v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl);
|
||||
|
||||
//update v_min and start_index j
|
||||
v_min = VFMINVV_FLOAT(v_min, vx0, gvl);
|
||||
v_min = VFMINVV_FLOAT(v_min, vx, gvl);
|
||||
j += gvl;
|
||||
ix += inc_xv;
|
||||
idx += inc_v;
|
||||
}
|
||||
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
|
||||
minf = VFMVFS_FLOAT(v_res);
|
||||
mask0 = VMFLEVF_FLOAT(v_min, minf, gvl);
|
||||
min_index = VMFIRSTM(mask0,gvl);
|
||||
VSEVU_UINT(temp_uint,v_min_index,gvl);
|
||||
min_index = temp_uint[min_index];
|
||||
|
||||
v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
|
||||
minf = EXTRACT_FLOAT(v_res);
|
||||
mask = VMFLEVF_FLOAT(v_min, minf, gvl);
|
||||
UINT_V_T compressed;
|
||||
compressed = VCOMPRESS(v_min_index, mask, gvl);
|
||||
min_index = VMV_X(compressed);
|
||||
|
||||
if(j < n){
|
||||
gvl = VSETVL(n-j);
|
||||
v_min_index = VMVVX_UINT(0, gvl);
|
||||
vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
|
||||
//fabs(vector)
|
||||
mask0 = VMFLTVF_FLOAT(vx0, 0, gvl);
|
||||
vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl);
|
||||
/*
|
||||
#if defined(DOUBLE)
|
||||
asm volatile(
|
||||
"vor.vv v0, %1, %1\n\t"
|
||||
"vsetvli x0, %3, e64,m8 \n\t"
|
||||
"vfrsub.vf %0, %0, %2, v0.t \n\t"
|
||||
:"+v"(vx0)
|
||||
:"v"(mask0), "f"(zero), "r"(gvl)
|
||||
:"v0");
|
||||
#else
|
||||
asm volatile(
|
||||
"vor.vv v0, %1, %1\n\t"
|
||||
"vsetvli x0, %3, e32,m8 \n\t"
|
||||
"vfrsub.vf %0, %0, %2, v0.t \n\t"
|
||||
:"+v"(vx0)
|
||||
:"v"(mask0), "f"(zero), "r"(gvl)
|
||||
:"v0");
|
||||
#endif
|
||||
*/
|
||||
vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
|
||||
//fabs(vector)
|
||||
mask1 = VMFLTVF_FLOAT(vx1, 0, gvl);
|
||||
vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl);
|
||||
/*
|
||||
#if defined(DOUBLE)
|
||||
asm volatile(
|
||||
"vor.vv v0, %1, %1\n\t"
|
||||
"vsetvli x0, %3, e64,m8 \n\t"
|
||||
"vfrsub.vf %0, %0, %2, v0.t \n\t"
|
||||
:"+v"(vx1)
|
||||
:"v"(mask1), "f"(zero), "r"(gvl)
|
||||
:"v0");
|
||||
#else
|
||||
asm volatile(
|
||||
"vor.vv v0, %1, %1\n\t"
|
||||
"vsetvli x0, %3, e32,m8 \n\t"
|
||||
"vfrsub.vf %0, %0, %2, v0.t \n\t"
|
||||
:"+v"(vx1)
|
||||
:"v"(mask1), "f"(zero), "r"(gvl)
|
||||
:"v0");
|
||||
#endif
|
||||
*/
|
||||
v_min = VFADDVV_FLOAT(vx0, vx1, gvl);
|
||||
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
|
||||
FLOAT cur_minf = VFMVFS_FLOAT(v_res);
|
||||
v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl);
|
||||
vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl);
|
||||
v_min = VFABS_FLOAT(v_min, gvl);
|
||||
vx2 = VFABS_FLOAT(vx2, gvl);
|
||||
v_min = VFADDVV_FLOAT(v_min, vx2, gvl);
|
||||
|
||||
v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
|
||||
FLOAT cur_minf = EXTRACT_FLOAT(v_res);
|
||||
if(cur_minf < minf){
|
||||
//tail index
|
||||
v_min_index = VIDV_UINT(gvl);
|
||||
v_min_index = VADDVX_UINT(v_min_index, j, gvl);
|
||||
|
||||
mask0 = VMFLEVF_FLOAT(v_min, cur_minf, gvl);
|
||||
min_index = VMFIRSTM(mask0,gvl);
|
||||
VSEVU_UINT(temp_uint,v_min_index,gvl);
|
||||
min_index = temp_uint[min_index];
|
||||
|
||||
mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl);
|
||||
UINT_V_T compressed;
|
||||
compressed = VCOMPRESS(v_min_index, mask, gvl);
|
||||
min_index = VMV_X(compressed);
|
||||
}
|
||||
}
|
||||
return(min_index+1);
|
||||
|
||||
return(min_index+1);
|
||||
}
|
||||
|
||||
|
||||
|
||||
98
kernel/riscv64/max_rvv.c
Normal file
98
kernel/riscv64/max_rvv.c
Normal file
@@ -0,0 +1,98 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <float.h>
|
||||
|
||||
/*
 * Precision-neutral aliases for the RVV intrinsics used by this kernel.
 * Without DOUBLE they resolve to the 32-bit (e32/f32) variants, with DOUBLE
 * to the 64-bit (e64/f64) variants.  Data vectors use the m8 register-group
 * type; the reduction result uses the m1 type, for which a separate
 * VSETVL_MAX_M1 is provided.
 */
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) __riscv_vsetvl_e32m8(n)
|
||||
#define VSETVL_MAX __riscv_vsetvlmax_e32m8()
|
||||
#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1()
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VLEV_FLOAT __riscv_vle32_v_f32m8
|
||||
#define VLSEV_FLOAT __riscv_vlse32_v_f32m8
|
||||
#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1
|
||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8
|
||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
|
||||
#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m8_tu
|
||||
#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32
|
||||
#else
|
||||
#define VSETVL(n) __riscv_vsetvl_e64m8(n)
|
||||
#define VSETVL_MAX __riscv_vsetvlmax_e64m8()
|
||||
#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1()
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VLEV_FLOAT __riscv_vle64_v_f64m8
|
||||
#define VLSEV_FLOAT __riscv_vlse64_v_f64m8
|
||||
#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1
|
||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8
|
||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
|
||||
#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m8_tu
|
||||
#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64
|
||||
#endif
|
||||
|
||||
/*
 * Largest element of a strided real vector.
 *
 *   n      number of elements to scan
 *   x      first element
 *   inc_x  distance between consecutive elements, counted in elements
 *
 * Returns the maximum value, or 0 when n <= 0 or inc_x <= 0.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    if (n <= 0 || inc_x <= 0) return (FLOAT)0.0;

    /*
     * Seed both accumulators with x[0] instead of -FLT_MAX.  A fixed sentinel
     * is only a valid identity while some input is >= the sentinel; with
     * DOUBLE data below -FLT_MAX (or all -inf) the sentinel itself would be
     * returned.  x[0] is always a member of the input, so it can never exceed
     * the true maximum.
     */
    const FLOAT seed = x[0];
    const size_t vlmax = VSETVL_MAX;
    FLOAT_V_T vmax = VFMVVF_FLOAT(seed, vlmax);                 /* per-lane maximum */
    FLOAT_V_T_M1 v_res = VFMVVF_FLOAT_M1(seed, VSETVL_MAX_M1);  /* reduction seed   */
    FLOAT_V_T vx;

    if (inc_x == 1) {

        for (size_t vl; n > 0; n -= vl, x += vl) {
            vl = VSETVL(n);
            vx = VLEV_FLOAT(x, vl);
            /* tail-undisturbed so lanes >= vl keep their running maximum */
            vmax = VFMAXVV_FLOAT_TU(vmax, vmax, vx, vl);
        }

    } else {

        /* byte stride between consecutive elements */
        const BLASLONG stride_x = inc_x * sizeof(FLOAT);

        for (size_t vl; n > 0; n -= vl, x += vl * inc_x) {
            vl = VSETVL(n);
            vx = VLSEV_FLOAT(x, stride_x, vl);
            vmax = VFMAXVV_FLOAT_TU(vmax, vmax, vx, vl);
        }

    }

    /* fold all lanes (and the seed) into element 0 and read it out */
    v_res = VFREDMAXVS_FLOAT(vmax, v_res, vlmax);
    return VFMVFS_FLOAT_M1(v_res);
}
|
||||
@@ -28,30 +28,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
#include <float.h>
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m1()
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VLEV_FLOAT vle32_v_f32m8
|
||||
#define VLSEV_FLOAT vlse32_v_f32m8
|
||||
#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f32m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
|
||||
#define VFMAXVV_FLOAT vfmax_vv_f32m8
|
||||
|
||||
#ifdef RISCV64_ZVL256B
|
||||
# define LMUL m2
|
||||
# if defined(DOUBLE)
|
||||
# define ELEN 64
|
||||
# define MLEN 32
|
||||
# else
|
||||
# define ELEN 32
|
||||
# define MLEN 16
|
||||
# endif
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e64m1()
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VLEV_FLOAT vle64_v_f64m8
|
||||
#define VLSEV_FLOAT vlse64_v_f64m8
|
||||
#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f64m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
|
||||
#define VFMAXVV_FLOAT vfmax_vv_f64m8
|
||||
# define LMUL m8
|
||||
# if defined(DOUBLE)
|
||||
# define ELEN 64
|
||||
# define MLEN 8
|
||||
# else
|
||||
# define ELEN 32
|
||||
# define MLEN 4
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#define _
|
||||
#define JOIN2_X(x, y) x ## y
|
||||
#define JOIN2(x, y) JOIN2_X(x, y)
|
||||
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
|
||||
|
||||
#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _)
|
||||
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
|
||||
#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _)
|
||||
#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL)
|
||||
#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL)
|
||||
#ifdef RISCV_0p10_INTRINSICS
|
||||
#define VFREDMAXVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl)
|
||||
#else
|
||||
#define VFREDMAXVS_FLOAT JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))
|
||||
#endif
|
||||
#define MASK_T JOIN(vbool, MLEN, _t, _, _)
|
||||
#define VMFLTVF_FLOAT JOIN(RISCV_RVV(vmflt_vf_f), ELEN, LMUL, _b, MLEN)
|
||||
#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _)
|
||||
#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _)
|
||||
#define VFMAXVV_FLOAT JOIN(RISCV_RVV(vfmax), _vv_f, ELEN, LMUL, _)
|
||||
|
||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
BLASLONG i=0, j=0;
|
||||
@@ -59,10 +77,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
FLOAT maxf=-FLT_MAX;
|
||||
unsigned int gvl = 0;
|
||||
FLOAT_V_T v0, v1, v_max;
|
||||
FLOAT_V_T_M1 v_res, v_min;
|
||||
gvl = VSETVL_MAX;
|
||||
v_res = VFMVVF_FLOAT_M1(0, gvl);
|
||||
v_min = VFMVVF_FLOAT_M1(-FLT_MAX, gvl);
|
||||
FLOAT_V_T_M1 v_res;
|
||||
v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1);
|
||||
|
||||
if(inc_x == 1){
|
||||
gvl = VSETVL(n);
|
||||
@@ -76,15 +92,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
v_max = VFMAXVV_FLOAT(v_max, v1, gvl);
|
||||
j += gvl * 2;
|
||||
}
|
||||
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl);
|
||||
maxf = *((FLOAT*)&v_res);
|
||||
v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
|
||||
}
|
||||
for(;j<n;){
|
||||
gvl = VSETVL(n-j);
|
||||
v0 = VLEV_FLOAT(&x[j], gvl);
|
||||
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_min, gvl);
|
||||
if(*((FLOAT*)&v_res) > maxf)
|
||||
maxf = *((FLOAT*)&v_res);
|
||||
v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl);
|
||||
j += gvl;
|
||||
}
|
||||
}else{
|
||||
@@ -102,18 +115,16 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
j += gvl * 2;
|
||||
idx += inc_xv * 2;
|
||||
}
|
||||
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl);
|
||||
maxf = *((FLOAT*)&v_res);
|
||||
v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
|
||||
}
|
||||
for(;j<n;){
|
||||
gvl = VSETVL(n-j);
|
||||
v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
|
||||
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_min, gvl);
|
||||
if(*((FLOAT*)&v_res) > maxf)
|
||||
maxf = *((FLOAT*)&v_res);
|
||||
v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl);
|
||||
j += gvl;
|
||||
}
|
||||
}
|
||||
maxf = EXTRACT_FLOAT(v_res);
|
||||
return(maxf);
|
||||
}
|
||||
|
||||
|
||||
98
kernel/riscv64/min_rvv.c
Normal file
98
kernel/riscv64/min_rvv.c
Normal file
@@ -0,0 +1,98 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <float.h>
|
||||
|
||||
/*
 * Precision-neutral aliases for the RVV intrinsics used by this kernel.
 * Without DOUBLE they resolve to the 32-bit (e32/f32) variants, with DOUBLE
 * to the 64-bit (e64/f64) variants.  Data vectors use the m8 register-group
 * type; the reduction result uses the m1 type, for which a separate
 * VSETVL_MAX_M1 is provided.
 */
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) __riscv_vsetvl_e32m8(n)
|
||||
#define VSETVL_MAX __riscv_vsetvlmax_e32m8()
|
||||
#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1()
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VLEV_FLOAT __riscv_vle32_v_f32m8
|
||||
#define VLSEV_FLOAT __riscv_vlse32_v_f32m8
|
||||
#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1
|
||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8
|
||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
|
||||
#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m8_tu
|
||||
#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32
|
||||
#else
|
||||
#define VSETVL(n) __riscv_vsetvl_e64m8(n)
|
||||
#define VSETVL_MAX __riscv_vsetvlmax_e64m8()
|
||||
#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1()
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VLEV_FLOAT __riscv_vle64_v_f64m8
|
||||
#define VLSEV_FLOAT __riscv_vlse64_v_f64m8
|
||||
#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1
|
||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8
|
||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
|
||||
#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m8_tu
|
||||
#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64
|
||||
#endif
|
||||
|
||||
/*
 * Smallest element of a strided real vector.
 *
 *   n      number of elements to scan
 *   x      first element
 *   inc_x  distance between consecutive elements, counted in elements
 *
 * Returns the minimum value, or 0 when n <= 0 or inc_x <= 0.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    if (n <= 0 || inc_x <= 0) return (FLOAT)0.0;

    /*
     * Seed both accumulators with x[0] instead of FLT_MAX.  A fixed sentinel
     * is only a valid identity while some input is <= the sentinel; with
     * DOUBLE data above FLT_MAX (or all +inf) the sentinel itself would be
     * returned.  x[0] is always a member of the input, so it can never be
     * below the true minimum.
     */
    const FLOAT seed = x[0];
    const size_t vlmax = VSETVL_MAX;
    FLOAT_V_T vmin = VFMVVF_FLOAT(seed, vlmax);                 /* per-lane minimum */
    FLOAT_V_T_M1 v_res = VFMVVF_FLOAT_M1(seed, VSETVL_MAX_M1);  /* reduction seed   */
    FLOAT_V_T vx;

    if (inc_x == 1) {

        for (size_t vl; n > 0; n -= vl, x += vl) {
            vl = VSETVL(n);
            vx = VLEV_FLOAT(x, vl);
            /* tail-undisturbed so lanes >= vl keep their running minimum */
            vmin = VFMINVV_FLOAT_TU(vmin, vmin, vx, vl);
        }

    } else {

        /* byte stride between consecutive elements */
        const BLASLONG stride_x = inc_x * sizeof(FLOAT);

        for (size_t vl; n > 0; n -= vl, x += vl * inc_x) {
            vl = VSETVL(n);
            vx = VLSEV_FLOAT(x, stride_x, vl);
            vmin = VFMINVV_FLOAT_TU(vmin, vmin, vx, vl);
        }

    }

    /* fold all lanes (and the seed) into element 0 and read it out */
    v_res = VFREDMINVS_FLOAT(vmin, v_res, vlmax);
    return VFMVFS_FLOAT_M1(v_res);
}
|
||||
@@ -28,30 +28,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
#include <float.h>
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m1()
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VLEV_FLOAT vle32_v_f32m8
|
||||
#define VLSEV_FLOAT vlse32_v_f32m8
|
||||
#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f32m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
|
||||
#define VFMINVV_FLOAT vfmin_vv_f32m8
|
||||
|
||||
#ifdef RISCV64_ZVL256B
|
||||
# define LMUL m2
|
||||
# if defined(DOUBLE)
|
||||
# define ELEN 64
|
||||
# define MLEN 32
|
||||
# else
|
||||
# define ELEN 32
|
||||
# define MLEN 16
|
||||
# endif
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e64m1()
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VLEV_FLOAT vle64_v_f64m8
|
||||
#define VLSEV_FLOAT vlse64_v_f64m8
|
||||
#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f64m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
|
||||
#define VFMINVV_FLOAT vfmin_vv_f64m8
|
||||
# define LMUL m8
|
||||
# if defined(DOUBLE)
|
||||
# define ELEN 64
|
||||
# define MLEN 8
|
||||
# else
|
||||
# define ELEN 32
|
||||
# define MLEN 4
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#define _
|
||||
#define JOIN2_X(x, y) x ## y
|
||||
#define JOIN2(x, y) JOIN2_X(x, y)
|
||||
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
|
||||
|
||||
#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _)
|
||||
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
|
||||
#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _)
|
||||
#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL)
|
||||
#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL)
|
||||
#ifdef RISCV_0p10_INTRINSICS
|
||||
#define VFREDMINVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl)
|
||||
#else
|
||||
#define VFREDMINVS_FLOAT JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))
|
||||
#endif
|
||||
#define MASK_T JOIN(vbool, MLEN, _t, _, _)
|
||||
#define VMFLTVF_FLOAT JOIN(RISCV_RVV(vmflt_vf_f), ELEN, LMUL, _b, MLEN)
|
||||
#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _)
|
||||
#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _)
|
||||
#define VFMINVV_FLOAT JOIN(RISCV_RVV(vfmin), _vv_f, ELEN, LMUL, _)
|
||||
|
||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
BLASLONG i=0, j=0;
|
||||
@@ -59,10 +77,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
FLOAT minf=FLT_MAX;
|
||||
unsigned int gvl = 0;
|
||||
FLOAT_V_T v0, v1, v_min;
|
||||
FLOAT_V_T_M1 v_res, v_max;
|
||||
gvl = VSETVL_MAX;
|
||||
v_res = VFMVVF_FLOAT_M1(0, gvl);
|
||||
v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl);
|
||||
FLOAT_V_T_M1 v_res;
|
||||
v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1);
|
||||
|
||||
if(inc_x == 1){
|
||||
gvl = VSETVL(n);
|
||||
@@ -76,15 +92,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
v_min = VFMINVV_FLOAT(v_min, v1, gvl);
|
||||
j += gvl * 2;
|
||||
}
|
||||
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
|
||||
minf = *((FLOAT*)&v_res);
|
||||
v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
|
||||
}
|
||||
for(;j<n;){
|
||||
gvl = VSETVL(n-j);
|
||||
v0 = VLEV_FLOAT(&x[j], gvl);
|
||||
v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl);
|
||||
if(*((FLOAT*)&v_res) < minf)
|
||||
minf = *((FLOAT*)&v_res);
|
||||
v_res = VFREDMINVS_FLOAT(v0, v_res, gvl);
|
||||
j += gvl;
|
||||
}
|
||||
}else{
|
||||
@@ -102,18 +115,16 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
j += gvl * 2;
|
||||
idx += inc_xv * 2;
|
||||
}
|
||||
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
|
||||
minf = *((FLOAT*)&v_res);
|
||||
v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
|
||||
}
|
||||
for(;j<n;){
|
||||
gvl = VSETVL(n-j);
|
||||
v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
|
||||
v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl);
|
||||
if(*((FLOAT*)&v_res) < minf)
|
||||
minf = *((FLOAT*)&v_res);
|
||||
v_res = VFREDMINVS_FLOAT(v0, v_res, gvl);
|
||||
j += gvl;
|
||||
}
|
||||
}
|
||||
minf = EXTRACT_FLOAT(v_res);
|
||||
return(minf);
|
||||
}
|
||||
|
||||
|
||||
212
kernel/riscv64/nrm2_rvv.c
Normal file
212
kernel/riscv64/nrm2_rvv.c
Normal file
@@ -0,0 +1,212 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Precision-dependent aliases for the RVV intrinsics used by the nrm2
 * kernel.  Both precisions use LMUL=4 data registers; the mask type is the
 * one paired with that element-width/LMUL combination (b16 for 64-bit
 * elements, b8 for 32-bit elements).
 *
 * VMANDN maps to the and-not mask intrinsic (vmandn); it previously aliased
 * the plain vmand intrinsic, which made it indistinguishable from VMAND.
 */
#if defined(DOUBLE)
#define VSETVL          __riscv_vsetvl_e64m4
#define FLOAT_V_T       vfloat64m4_t
#define FLOAT_V_T_M1    vfloat64m1_t
#define VLEV_FLOAT      __riscv_vle64_v_f64m4
#define VLSEV_FLOAT     __riscv_vlse64_v_f64m4
#define VFMVVF_FLOAT    __riscv_vfmv_v_f_f64m4
#define VFMVSF_FLOAT    __riscv_vfmv_s_f_f64m4
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
#define MASK_T          vbool16_t
#define VFABS           __riscv_vfabs_v_f64m4
#define VMFNE           __riscv_vmfne_vf_f64m4_b16
#define VMFGT           __riscv_vmfgt_vv_f64m4_b16
#define VMFEQ           __riscv_vmfeq_vf_f64m4_b16
#define VCPOP           __riscv_vcpop_m_b16
#define VFREDMAX        __riscv_vfredmax_vs_f64m4_f64m1
#define VFREDMIN        __riscv_vfredmin_vs_f64m4_f64m1
#define VFIRST          __riscv_vfirst_m_b16
#define VRGATHER        __riscv_vrgather_vx_f64m4
#define VFDIV           __riscv_vfdiv_vv_f64m4
#define VFDIV_M         __riscv_vfdiv_vv_f64m4_mu
#define VFMUL           __riscv_vfmul_vv_f64m4
#define VFMUL_M         __riscv_vfmul_vv_f64m4_mu
#define VFMACC          __riscv_vfmacc_vv_f64m4
#define VFMACC_M        __riscv_vfmacc_vv_f64m4_mu
#define VMSBF           __riscv_vmsbf_m_b16
#define VMSOF           __riscv_vmsof_m_b16
#define VMAND           __riscv_vmand_mm_b16
#define VMANDN          __riscv_vmandn_mm_b16
#define VFREDSUM        __riscv_vfredusum_vs_f64m4_f64m1
#define VMERGE          __riscv_vmerge_vvm_f64m4
#define VSEV_FLOAT      __riscv_vse64_v_f64m4
#define EXTRACT_FLOAT0_V(v) __riscv_vfmv_f_s_f64m4_f64(v)
#define ABS fabs
#else
#define VSETVL          __riscv_vsetvl_e32m4
#define FLOAT_V_T       vfloat32m4_t
#define FLOAT_V_T_M1    vfloat32m1_t
#define VLEV_FLOAT      __riscv_vle32_v_f32m4
#define VLSEV_FLOAT     __riscv_vlse32_v_f32m4
#define VFMVVF_FLOAT    __riscv_vfmv_v_f_f32m4
#define VFMVSF_FLOAT    __riscv_vfmv_s_f_f32m4
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
#define MASK_T          vbool8_t
#define VFABS           __riscv_vfabs_v_f32m4
#define VMFNE           __riscv_vmfne_vf_f32m4_b8
#define VMFGT           __riscv_vmfgt_vv_f32m4_b8
#define VMFEQ           __riscv_vmfeq_vf_f32m4_b8
#define VCPOP           __riscv_vcpop_m_b8
#define VFREDMAX        __riscv_vfredmax_vs_f32m4_f32m1
#define VFREDMIN        __riscv_vfredmin_vs_f32m4_f32m1
#define VFIRST          __riscv_vfirst_m_b8
#define VRGATHER        __riscv_vrgather_vx_f32m4
#define VFDIV           __riscv_vfdiv_vv_f32m4
#define VFDIV_M         __riscv_vfdiv_vv_f32m4_mu
#define VFMUL           __riscv_vfmul_vv_f32m4
#define VFMUL_M         __riscv_vfmul_vv_f32m4_mu
#define VFMACC          __riscv_vfmacc_vv_f32m4
#define VFMACC_M        __riscv_vfmacc_vv_f32m4_mu
#define VMSBF           __riscv_vmsbf_m_b8
#define VMSOF           __riscv_vmsof_m_b8
#define VMAND           __riscv_vmand_mm_b8
#define VMANDN          __riscv_vmandn_mm_b8
#define VFREDSUM        __riscv_vfredusum_vs_f32m4_f32m1
#define VMERGE          __riscv_vmerge_vvm_f32m4
#define VSEV_FLOAT      __riscv_vse32_v_f32m4
#define EXTRACT_FLOAT0_V(v) __riscv_vfmv_f_s_f32m4_f32(v)
#define ABS fabsf
#endif
|
||||
|
||||
/*
 * Euclidean norm of a strided vector, computed as scale * sqrt(ssq) so that
 * intermediate squares are taken of values already divided by a running
 * maximum magnitude.
 *
 *   n      number of elements
 *   x      pointer to the first element
 *   inc_x  distance between consecutive elements, in units of FLOAT
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0, and |x[0]| when n == 1.
 *
 * The bulk of the input is processed gvl elements at a time with one
 * independent (scale, ssq) pair per vector lane; the lanes are then merged
 * in scalar code and any remaining elements are folded in one by one.
 *
 * All element offsets and the byte stride are kept in BLASLONG so that
 * n * inc_x beyond the range of int does not overflow the index.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG i = 0;

    if (n <= 0 || inc_x <= 0) return(0.0);
    if (n == 1) return (ABS(x[0]));

    MASK_T nonzero_mask;
    MASK_T scale_mask;

    const size_t gvl = VSETVL(n);
    /* Signed copy of the vector length for arithmetic with n, i and inc_x. */
    const BLASLONG lanes = (BLASLONG)gvl;

    FLOAT_V_T v0;
    FLOAT_V_T v_ssq   = VFMVVF_FLOAT(0, gvl);
    FLOAT_V_T v_scale = VFMVVF_FLOAT(0, gvl);

    FLOAT scale = 0;
    FLOAT ssq = 0;
    /* The strided load takes its stride in bytes. */
    const BLASLONG stride_x = inc_x * sizeof(FLOAT);
    BLASLONG idx = 0;

    if (n >= lanes) // don't pay overheads if we're not doing useful work
    {
        for (i = 0; i < n / lanes; i++) {
            v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl);
            nonzero_mask = VMFNE(v0, 0, gvl);
            v0 = VFABS(v0, gvl);
            scale_mask = VMFGT(v0, v_scale, gvl);

            // assume scale changes are relatively infrequent

            // unclear if the vcpop+branch is actually a win
            // since the operations being skipped are predicated anyway
            // need profiling to confirm
            if (VCPOP(scale_mask, gvl))
            {
                /*
                 * For lanes whose new magnitude exceeds the lane's scale:
                 * ssq *= (old_scale / new_scale)^2, then scale = new value.
                 */
                v_scale = VFDIV_M(scale_mask, v_scale, v_scale, v0, gvl);
                v_scale = VFMUL_M(scale_mask, v_scale, v_scale, v_scale, gvl);
                v_ssq   = VFMUL_M(scale_mask, v_ssq, v_ssq, v_scale, gvl);
                v_scale = VMERGE(v_scale, v0, scale_mask, gvl);
            }

            /* For non-zero inputs: ssq += (|x| / scale)^2. */
            v0    = VFDIV_M(nonzero_mask, v0, v0, v_scale, gvl);
            v_ssq = VFMACC_M(nonzero_mask, v_ssq, v0, v0, gvl);
            idx += inc_x * lanes;
        }

        // we have gvl elements which we accumulated independently, with independent scales
        // we need to combine these
        // naive sort so we process small values first to avoid losing information
        // could use vector sort extensions where available, but we're dealing with gvl elts at most

        FLOAT *out_ssq   = alloca(gvl * sizeof(FLOAT));
        FLOAT *out_scale = alloca(gvl * sizeof(FLOAT));
        VSEV_FLOAT(out_ssq, v_ssq, gvl);
        VSEV_FLOAT(out_scale, v_scale, gvl);

        /* Selection sort of the (scale, ssq) pairs by ascending scale. */
        for (size_t a = 0; a + 1 < gvl; ++a)
        {
            size_t smallest = a;
            for (size_t b = a + 1; b < gvl; ++b)
                if (out_scale[b] < out_scale[smallest])
                    smallest = b;
            if (smallest != a)
            {
                FLOAT tmp1 = out_ssq[a];
                FLOAT tmp2 = out_scale[a];
                out_ssq[a] = out_ssq[smallest];
                out_scale[a] = out_scale[smallest];
                out_ssq[smallest] = tmp1;
                out_scale[smallest] = tmp2;
            }
        }

        /* Skip lanes whose scale is still zero; dividing by them is avoided. */
        size_t a = 0;
        while (a < gvl && out_scale[a] == 0)
            ++a;

        if (a < gvl)
        {
            ssq = out_ssq[a];
            scale = out_scale[a];
            ++a;
            /* Fold each remaining lane in, rescaling the running ssq to its scale. */
            for ( ; a < gvl; ++a)
            {
                ssq = ssq * (scale / out_scale[a]) * (scale / out_scale[a]) + out_ssq[a];
                scale = out_scale[a];
            }
        }
    }

    //finish any tail using scalar ops
    i *= lanes * inc_x;   /* element offset of the first unprocessed entry */
    n *= inc_x;           /* one-past-the-end element offset */
    while (i < n) {
        if (x[i] != 0.0) {
            FLOAT absxi = ABS(x[i]);
            if (scale < absxi) {
                ssq = 1 + ssq * (scale / absxi) * (scale / absxi);
                scale = absxi;
            }
            else {
                ssq += (absxi / scale) * (absxi / scale);
            }
        }

        i += inc_x;
    }

    return(scale * sqrt(ssq));
}
|
||||
|
||||
|
||||
@@ -26,207 +26,189 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m4(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m1()
|
||||
#define FLOAT_V_T vfloat32m4_t
|
||||
#define VFMVFS_FLOATM4 vfmv_f_s_f32m4_f32
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
|
||||
#define VLEV_FLOAT vle32_v_f32m4
|
||||
#define VLSEV_FLOAT vlse32_v_f32m4
|
||||
#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
|
||||
#define VFMACCVV_FLOAT vfmacc_vv_f32m4
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f32m4
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
|
||||
#define VFDOTVV_FLOAT vfdot_vv_f32m4
|
||||
#define ABS fabsf
|
||||
#define MASK_T vbool8_t
|
||||
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m4_m
|
||||
#define VMFGTVF_FLOAT vmfgt_vf_f32m4_b8
|
||||
#define VMFIRSTM vmfirst_m_b8
|
||||
#define VFDIVVF_FLOAT vfdiv_vf_f32m4
|
||||
#define VMFLTVF_FLOAT vmflt_vf_f32m4_b8
|
||||
#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1
|
||||
|
||||
#ifdef RISCV64_ZVL256B
|
||||
# define LMUL m1
|
||||
# if defined(DOUBLE)
|
||||
# define ELEN 64
|
||||
# define MLEN 64
|
||||
# else
|
||||
# define ELEN 32
|
||||
# define MLEN 32
|
||||
# endif
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m4(n)
|
||||
#define VSETVL_MAX vsetvlmax_e64m1()
|
||||
#define FLOAT_V_T vfloat64m4_t
|
||||
#define VFMVFS_FLOATM4 vfmv_f_s_f64m4_f64
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
|
||||
#define VLEV_FLOAT vle64_v_f64m4
|
||||
#define VLSEV_FLOAT vlse64_v_f64m4
|
||||
#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
|
||||
#define VFMACCVV_FLOAT vfmacc_vv_f64m4
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f64m4
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
|
||||
#define VFDOTVV_FLOAT vfdot_vv_f64m4
|
||||
#define ABS fabs
|
||||
#define MASK_T vbool16_t
|
||||
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m4_m
|
||||
#define VMFGTVF_FLOAT vmfgt_vf_f64m4_b16
|
||||
#define VMFIRSTM vmfirst_m_b16
|
||||
#define VFDIVVF_FLOAT vfdiv_vf_f64m4
|
||||
#define VMFLTVF_FLOAT vmflt_vf_f64m4_b16
|
||||
#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1
|
||||
# define LMUL m4
|
||||
# if defined(DOUBLE)
|
||||
# define ELEN 64
|
||||
# define MLEN 16
|
||||
# else
|
||||
# define ELEN 32
|
||||
# define MLEN 8
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#define _
|
||||
#define JOIN2_X(x, y) x ## y
|
||||
#define JOIN2(x, y) JOIN2_X(x, y)
|
||||
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
|
||||
|
||||
#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _)
|
||||
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
|
||||
#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _)
|
||||
#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL)
|
||||
#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL)
|
||||
#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _)
|
||||
#define VFMVSF_FLOAT JOIN(RISCV_RVV(vfmv), _s_f_f, ELEN, LMUL, _)
|
||||
#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _)
|
||||
#define MASK_T JOIN(vbool, MLEN, _t, _, _)
|
||||
#define VFABS JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _)
|
||||
#define VMFNE JOIN(RISCV_RVV(vmfne_vf_f),ELEN, LMUL, _b, MLEN)
|
||||
#define VMFGT JOIN(RISCV_RVV(vmfgt_vv_f),ELEN, LMUL, _b, MLEN)
|
||||
#define VMFEQ JOIN(RISCV_RVV(vmfeq_vf_f),ELEN, LMUL, _b, MLEN)
|
||||
#define VCPOP JOIN(RISCV_RVV(vcpop), _m_b, MLEN, _, _)
|
||||
#ifdef RISCV_0p10_INTRINSICS
|
||||
#define VFDIV_M JOIN(RISCV_RVV(vfdiv), _vv_f, ELEN, LMUL, _m)
|
||||
#define VFMUL_M JOIN(RISCV_RVV(vfmul), _vv_f, ELEN, LMUL, _m)
|
||||
#define VFMACC_M JOIN(RISCV_RVV(vfmacc), _vv_f, ELEN, LMUL, _m)
|
||||
#define VMERGE(a, b, mask, gvl) JOIN(RISCV_RVV(vmerge), _vvm_f, ELEN, LMUL, _)(mask, a, b, gvl)
|
||||
#else
|
||||
#define VFDIV_M JOIN(RISCV_RVV(vfdiv), _vv_f, ELEN, LMUL, _mu)
|
||||
#define VFMUL_M JOIN(RISCV_RVV(vfmul), _vv_f, ELEN, LMUL, _mu)
|
||||
#define VFMACC_M JOIN(RISCV_RVV(vfmacc), _vv_f, ELEN, LMUL, _mu)
|
||||
#define VMERGE JOIN(RISCV_RVV(vmerge), _vvm_f, ELEN, LMUL, _)
|
||||
#endif
|
||||
#define VFIRST JOIN(RISCV_RVV(vfirst), _m_b, MLEN, _, _)
|
||||
#define VRGATHER JOIN(RISCV_RVV(vrgather), _vx_f, ELEN, LMUL, _)
|
||||
#define VFDIV JOIN(RISCV_RVV(vfdiv), _vv_f, ELEN, LMUL, _)
|
||||
#define VFMUL JOIN(RISCV_RVV(vfmul), _vv_f, ELEN, LMUL, _)
|
||||
#define VFMACC JOIN(RISCV_RVV(vfmacc), _vv_f, ELEN, LMUL, _)
|
||||
#define VMSBF JOIN(RISCV_RVV(vmsbf), _m_b, MLEN, _, _)
|
||||
#define VMSOF JOIN(RISCV_RVV(vmsof), _m_b, MLEN, _, _)
|
||||
#define VMAND JOIN(RISCV_RVV(vmand), _mm_b, MLEN, _, _)
|
||||
#define VMANDN JOIN(RISCV_RVV(vmandn), _mm_b, MLEN, _, _)
|
||||
|
||||
#define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL)
|
||||
|
||||
#if defined(DOUBLE)
|
||||
#define ABS fabs
|
||||
#else
|
||||
#define ABS fabsf
|
||||
#endif
|
||||
|
||||
#define EXTRACT_FLOAT0_V(v) JOIN(RISCV_RVV(vfmv_f_s_f), ELEN, LMUL, _f, ELEN)(v)
|
||||
|
||||
//#define DUMP( label, v0, gvl )
|
||||
#define DUMP( label, v0, gvl ) do{ FLOAT x[16]; VSEV_FLOAT( x, v0, gvl ); printf ("%s(%d): %s [ ", __FILE__, __LINE__, label); for( int xxx = 0; xxx < gvl; ++xxx ) { printf("%f, ", x[xxx]); } printf(" ]\n"); } while(0)
|
||||
|
||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
BLASLONG i=0, j=0;
|
||||
BLASLONG i=0;
|
||||
|
||||
if ( n < 0 ) return(0.0);
|
||||
if (n <= 0 || inc_x <= 0) return(0.0);
|
||||
if(n == 1) return (ABS(x[0]));
|
||||
|
||||
FLOAT_V_T vr, v0, v_zero;
|
||||
unsigned int gvl = 0;
|
||||
FLOAT_V_T_M1 v_res, v_z0;
|
||||
gvl = VSETVL_MAX;
|
||||
v_res = VFMVVF_FLOAT_M1(0, gvl);
|
||||
v_z0 = VFMVVF_FLOAT_M1(0, gvl);
|
||||
|
||||
FLOAT scale = 0.0, ssq = 0.0;
|
||||
MASK_T mask;
|
||||
BLASLONG index = 0;
|
||||
if(inc_x == 1){
|
||||
gvl = VSETVL(n);
|
||||
vr = VFMVVF_FLOAT(0, gvl);
|
||||
v_zero = VFMVVF_FLOAT(0, gvl);
|
||||
for(i=0,j=0; i<n/gvl; i++){
|
||||
v0 = VLEV_FLOAT(&x[j], gvl);
|
||||
//fabs(vector)
|
||||
mask = VMFLTVF_FLOAT(v0, 0, gvl);
|
||||
v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
|
||||
//if scale change
|
||||
mask = VMFGTVF_FLOAT(v0, scale, gvl);
|
||||
index = VMFIRSTM(mask, gvl);
|
||||
if(index == -1){//no elements greater than scale
|
||||
if(scale != 0.0){
|
||||
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
|
||||
vr = VFMACCVV_FLOAT(vr, v0, v0, gvl);
|
||||
}
|
||||
}else{//found greater element
|
||||
//ssq in vector vr: vr[0]
|
||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
|
||||
//total ssq before current vector
|
||||
ssq += VFMVFS_FLOAT(v_res);
|
||||
//find max
|
||||
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
|
||||
//update ssq before max_index
|
||||
ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
|
||||
//update scale
|
||||
scale = VFMVFS_FLOAT(v_res);
|
||||
//ssq in vector vr
|
||||
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
|
||||
vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
|
||||
}
|
||||
j += gvl;
|
||||
}
|
||||
//ssq in vector vr: vr[0]
|
||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
|
||||
//total ssq now
|
||||
ssq += VFMVFS_FLOAT(v_res);
|
||||
MASK_T nonzero_mask;
|
||||
MASK_T scale_mask;
|
||||
|
||||
//tail
|
||||
if(j < n){
|
||||
gvl = VSETVL(n-j);
|
||||
v0 = VLEV_FLOAT(&x[j], gvl);
|
||||
//fabs(vector)
|
||||
mask = VMFLTVF_FLOAT(v0, 0, gvl);
|
||||
v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
|
||||
//if scale change
|
||||
mask = VMFGTVF_FLOAT(v0, scale, gvl);
|
||||
index = VMFIRSTM(mask, gvl);
|
||||
if(index == -1){//no elements greater than scale
|
||||
if(scale != 0.0)
|
||||
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
|
||||
}else{//found greater element
|
||||
//find max
|
||||
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
|
||||
//update ssq before max_index
|
||||
ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
|
||||
//update scale
|
||||
scale = VFMVFS_FLOAT(v_res);
|
||||
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
|
||||
}
|
||||
vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
|
||||
//ssq in vector vr: vr[0]
|
||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
|
||||
//total ssq now
|
||||
ssq += VFMVFS_FLOAT(v_res);
|
||||
}
|
||||
}else{
|
||||
gvl = VSETVL(n);
|
||||
vr = VFMVVF_FLOAT(0, gvl);
|
||||
v_zero = VFMVVF_FLOAT(0, gvl);
|
||||
unsigned int stride_x = inc_x * sizeof(FLOAT);
|
||||
int idx = 0, inc_v = inc_x * gvl;
|
||||
for(i=0,j=0; i<n/gvl; i++){
|
||||
v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl);
|
||||
//fabs(vector)
|
||||
mask = VMFLTVF_FLOAT(v0, 0, gvl);
|
||||
v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
|
||||
//if scale change
|
||||
mask = VMFGTVF_FLOAT(v0, scale, gvl);
|
||||
index = VMFIRSTM(mask, gvl);
|
||||
if(index == -1){//no elements greater than scale
|
||||
if(scale != 0.0){
|
||||
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
|
||||
vr = VFMACCVV_FLOAT(vr, v0, v0, gvl);
|
||||
}
|
||||
}else{//found greater element
|
||||
//ssq in vector vr: vr[0]
|
||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
|
||||
//total ssq before current vector
|
||||
ssq += VFMVFS_FLOAT(v_res);
|
||||
//find max
|
||||
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
|
||||
//update ssq before max_index
|
||||
ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
|
||||
//update scale
|
||||
scale = VFMVFS_FLOAT(v_res);
|
||||
//ssq in vector vr
|
||||
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
|
||||
vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
|
||||
}
|
||||
j += gvl;
|
||||
idx += inc_v;
|
||||
}
|
||||
//ssq in vector vr: vr[0]
|
||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
|
||||
//total ssq now
|
||||
ssq += VFMVFS_FLOAT(v_res);
|
||||
gvl = VSETVL(n);
|
||||
FLOAT_V_T v0;
|
||||
FLOAT_V_T v_ssq = VFMVVF_FLOAT(0, gvl);
|
||||
FLOAT_V_T v_scale = VFMVVF_FLOAT(0, gvl);
|
||||
|
||||
//tail
|
||||
if(j < n){
|
||||
gvl = VSETVL(n-j);
|
||||
v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl);
|
||||
//fabs(vector)
|
||||
mask = VMFLTVF_FLOAT(v0, 0, gvl);
|
||||
v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
|
||||
//if scale change
|
||||
mask = VMFGTVF_FLOAT(v0, scale, gvl);
|
||||
index = VMFIRSTM(mask, gvl);
|
||||
if(index == -1){//no elements greater than scale
|
||||
if(scale != 0.0)
|
||||
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
|
||||
}else{//found greater element
|
||||
//find max
|
||||
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
|
||||
//update ssq before max_index
|
||||
ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
|
||||
//update scale
|
||||
scale = VFMVFS_FLOATM4(vr);
|
||||
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
|
||||
FLOAT scale = 0;
|
||||
FLOAT ssq = 0;
|
||||
unsigned int stride_x = inc_x * sizeof(FLOAT);
|
||||
int idx = 0;
|
||||
|
||||
if( n >= gvl ) // don't pay overheads if we're not doing useful work
|
||||
{
|
||||
for(i=0; i<n/gvl; i++){
|
||||
v0 = VLSEV_FLOAT( &x[idx], stride_x, gvl );
|
||||
nonzero_mask = VMFNE( v0, 0, gvl );
|
||||
v0 = VFABS( v0, gvl );
|
||||
scale_mask = VMFGT( v0, v_scale, gvl );
|
||||
|
||||
// assume scale changes are relatively infrequent
|
||||
|
||||
// unclear if the vcpop+branch is actually a win
|
||||
// since the operations being skipped are predicated anyway
|
||||
// need profiling to confirm
|
||||
if( VCPOP(scale_mask, gvl) )
|
||||
{
|
||||
v_scale = VFDIV_M( scale_mask, v_scale, v_scale, v0, gvl );
|
||||
v_scale = VFMUL_M( scale_mask, v_scale, v_scale, v_scale, gvl );
|
||||
v_ssq = VFMUL_M( scale_mask, v_ssq, v_ssq, v_scale, gvl );
|
||||
v_scale = VMERGE( v_scale, v0, scale_mask, gvl );
|
||||
}
|
||||
v0 = VFDIV_M( nonzero_mask, v0, v0, v_scale, gvl );
|
||||
v_ssq = VFMACC_M( nonzero_mask, v_ssq, v0, v0, gvl );
|
||||
idx += inc_x * gvl;
|
||||
}
|
||||
|
||||
// we have gvl elements which we accumulated independently, with independent scales
|
||||
// we need to combine these
|
||||
// naive sort so we process small values first to avoid losing information
|
||||
// could use vector sort extensions where available, but we're dealing with gvl elts at most
|
||||
|
||||
FLOAT * out_ssq = alloca(gvl*sizeof(FLOAT));
|
||||
FLOAT * out_scale = alloca(gvl*sizeof(FLOAT));
|
||||
VSEV_FLOAT( out_ssq, v_ssq, gvl );
|
||||
VSEV_FLOAT( out_scale, v_scale, gvl );
|
||||
for( int a = 0; a < (gvl-1); ++a )
|
||||
{
|
||||
int smallest = a;
|
||||
for( size_t b = a+1; b < gvl; ++b )
|
||||
if( out_scale[b] < out_scale[smallest] )
|
||||
smallest = b;
|
||||
if( smallest != a )
|
||||
{
|
||||
FLOAT tmp1 = out_ssq[a];
|
||||
FLOAT tmp2 = out_scale[a];
|
||||
out_ssq[a] = out_ssq[smallest];
|
||||
out_scale[a] = out_scale[smallest];
|
||||
out_ssq[smallest] = tmp1;
|
||||
out_scale[smallest] = tmp2;
|
||||
}
|
||||
}
|
||||
|
||||
int a = 0;
|
||||
while( a<gvl && out_scale[a] == 0 )
|
||||
++a;
|
||||
|
||||
if( a < gvl )
|
||||
{
|
||||
ssq = out_ssq[a];
|
||||
scale = out_scale[a];
|
||||
++a;
|
||||
for( ; a < gvl; ++a )
|
||||
{
|
||||
ssq = ssq * ( scale / out_scale[a] ) * ( scale / out_scale[a] ) + out_ssq[a];
|
||||
scale = out_scale[a];
|
||||
}
|
||||
vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
|
||||
//ssq in vector vr: vr[0]
|
||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
|
||||
//total ssq now
|
||||
ssq += VFMVFS_FLOAT(v_res);
|
||||
}
|
||||
}
|
||||
|
||||
//finish any tail using scalar ops
|
||||
i*=gvl*inc_x;
|
||||
n*=inc_x;
|
||||
while(i < n){
|
||||
if ( x[i] != 0.0 ){
|
||||
FLOAT absxi = ABS( x[i] );
|
||||
if ( scale < absxi ){
|
||||
ssq = 1 + ssq * ( scale / absxi ) * ( scale / absxi );
|
||||
scale = absxi ;
|
||||
}
|
||||
else{
|
||||
ssq += ( absxi/scale ) * ( absxi/scale );
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
i += inc_x;
|
||||
}
|
||||
|
||||
return(scale * sqrt(ssq));
|
||||
}
|
||||
|
||||
|
||||
@@ -31,9 +31,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define VSETVL_MAX vsetvlmax_e32m1()
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VLEV_FLOAT vle32_v_f32m8
|
||||
#define VLSEV_FLOAT vlse32_v_f32m8
|
||||
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
|
||||
#define VLEV_FLOAT vle_v_f32m8
|
||||
#define VLSEV_FLOAT vlse_v_f32m8
|
||||
#define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1
|
||||
#define VFMACCVV_FLOAT vfmacc_vv_f32m8
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f32m8
|
||||
@@ -45,9 +45,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define VSETVL_MAX vsetvlmax_e64m1()
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VLEV_FLOAT vle64_v_f64m8
|
||||
#define VLSEV_FLOAT vlse64_v_f64m8
|
||||
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
|
||||
#define VLEV_FLOAT vle_v_f64m8
|
||||
#define VLSEV_FLOAT vlse_v_f64m8
|
||||
#define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1
|
||||
#define VFMACCVV_FLOAT vfmacc_vv_f64m8
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f64m8
|
||||
@@ -61,7 +61,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
BLASLONG i=0, j=0;
|
||||
double len = 0.0 ;
|
||||
|
||||
if ( n < 0 ) return(0.0);
|
||||
if ( n <= 0 ) return(0.0);
|
||||
if(n == 1) return (ABS(x[0]));
|
||||
|
||||
FLOAT_V_T vr, v0, v1;
|
||||
|
||||
149
kernel/riscv64/rot_rvv.c
Normal file
149
kernel/riscv64/rot_rvv.c
Normal file
@@ -0,0 +1,149 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Precision-dependent aliases for the RVV intrinsics used by the rot kernel.
 * Both precisions operate on LMUL=8 register groups; only the element width
 * (e64 for DOUBLE, e32 otherwise) differs between the two branches.
 */
#if defined(DOUBLE)
#define FLOAT_V_T       vfloat64m8_t
#define VSETVL(n)       __riscv_vsetvl_e64m8(n)
/* loads / stores, unit-stride and strided */
#define VLEV_FLOAT      __riscv_vle64_v_f64m8
#define VLSEV_FLOAT     __riscv_vlse64_v_f64m8
#define VSEV_FLOAT      __riscv_vse64_v_f64m8
#define VSSEV_FLOAT     __riscv_vsse64_v_f64m8
/* vector-scalar arithmetic */
#define VFMULVF_FLOAT   __riscv_vfmul_vf_f64m8
#define VFMACCVF_FLOAT  __riscv_vfmacc_vf_f64m8
#define VFMSACVF_FLOAT  __riscv_vfmsac_vf_f64m8
#else
#define FLOAT_V_T       vfloat32m8_t
#define VSETVL(n)       __riscv_vsetvl_e32m8(n)
/* loads / stores, unit-stride and strided */
#define VLEV_FLOAT      __riscv_vle32_v_f32m8
#define VLSEV_FLOAT     __riscv_vlse32_v_f32m8
#define VSEV_FLOAT      __riscv_vse32_v_f32m8
#define VSSEV_FLOAT     __riscv_vsse32_v_f32m8
/* vector-scalar arithmetic */
#define VFMULVF_FLOAT   __riscv_vfmul_vf_f32m8
#define VFMACCVF_FLOAT  __riscv_vfmacc_vf_f32m8
#define VFMSACVF_FLOAT  __riscv_vfmsac_vf_f32m8
#endif
|
||||
|
||||
/*
 * Apply a plane rotation to the vectors x and y, in place:
 *
 *     x[i] <- c * x[i] + s * y[i]
 *     y[i] <- c * y[i] - s * x[i]      (using the x[i] read before the update)
 *
 * Parameters:
 *   n      number of element pairs to rotate; nothing is done when n <= 0
 *   x, y   operand vectors, both overwritten with the rotated values
 *   inc_x  stride of x, in elements
 *   inc_y  stride of y, in elements
 *   c, s   rotation coefficients
 *
 * Returns 0 in all cases.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
    if (n <= 0) return(0);

    FLOAT_V_T v0, v1, vx, vy;

    if (inc_x == 0 || inc_y == 0) {
        /* A zero stride revisits the same element on every iteration, so the
         * pairs are rotated one at a time, in order. */
        BLASLONG ix = 0, iy = 0;

        for (BLASLONG i = 0; i < n; i++) {
            FLOAT temp = c*x[ix] + s*y[iy];
            y[iy] = c*y[iy] - s*x[ix];
            x[ix] = temp;

            ix += inc_x;
            iy += inc_y;
        }
    }
    else if (inc_x == 1 && inc_y == 1) {
        /* Both vectors contiguous: unit-stride loads and stores. */
        for (size_t vl; n > 0; n -= vl, x += vl, y += vl) {
            vl = VSETVL(n);

            vx = VLEV_FLOAT(x, vl);
            vy = VLEV_FLOAT(y, vl);

            /* v0 = c*x + s*y */
            v0 = VFMULVF_FLOAT(vx, c, vl);
            v0 = VFMACCVF_FLOAT(v0, s, vy, vl);
            VSEV_FLOAT(x, v0, vl);

            /* v1 = c*y - s*x */
            v1 = VFMULVF_FLOAT(vx, s, vl);
            v1 = VFMSACVF_FLOAT(v1, c, vy, vl);
            VSEV_FLOAT(y, v1, vl);
        }
    }
    else if (inc_y == 1) {
        /* x strided, y contiguous. */
        BLASLONG stride_x = inc_x * sizeof(FLOAT);

        /* vl is unsigned: the pointer step is computed in signed arithmetic so
         * a negative inc_x steps backwards without unsigned wraparound. */
        for (size_t vl; n > 0; n -= vl, x += (BLASLONG)vl * inc_x, y += vl) {
            vl = VSETVL(n);

            vx = VLSEV_FLOAT(x, stride_x, vl);
            vy = VLEV_FLOAT(y, vl);

            /* v0 = c*x + s*y */
            v0 = VFMULVF_FLOAT(vx, c, vl);
            v0 = VFMACCVF_FLOAT(v0, s, vy, vl);
            VSSEV_FLOAT(x, stride_x, v0, vl);

            /* v1 = c*y - s*x */
            v1 = VFMULVF_FLOAT(vx, s, vl);
            v1 = VFMSACVF_FLOAT(v1, c, vy, vl);
            VSEV_FLOAT(y, v1, vl);
        }
    }
    else if (inc_x == 1) {
        /* x contiguous, y strided. */
        BLASLONG stride_y = inc_y * sizeof(FLOAT);

        /* Signed pointer step, see the strided-x branch above. */
        for (size_t vl; n > 0; n -= vl, x += vl, y += (BLASLONG)vl * inc_y) {
            vl = VSETVL(n);

            vx = VLEV_FLOAT(x, vl);
            vy = VLSEV_FLOAT(y, stride_y, vl);

            /* v0 = c*x + s*y */
            v0 = VFMULVF_FLOAT(vx, c, vl);
            v0 = VFMACCVF_FLOAT(v0, s, vy, vl);
            VSEV_FLOAT(x, v0, vl);

            /* v1 = c*y - s*x */
            v1 = VFMULVF_FLOAT(vx, s, vl);
            v1 = VFMSACVF_FLOAT(v1, c, vy, vl);
            VSSEV_FLOAT(y, stride_y, v1, vl);
        }
    }
    else {
        /* Both vectors strided. */
        BLASLONG stride_x = inc_x * sizeof(FLOAT);
        BLASLONG stride_y = inc_y * sizeof(FLOAT);

        /* Signed pointer steps so negative increments step backwards
         * without unsigned wraparound. */
        for (size_t vl; n > 0; n -= vl, x += (BLASLONG)vl * inc_x, y += (BLASLONG)vl * inc_y) {
            vl = VSETVL(n);

            vx = VLSEV_FLOAT(x, stride_x, vl);
            vy = VLSEV_FLOAT(y, stride_y, vl);

            /* v0 = c*x + s*y */
            v0 = VFMULVF_FLOAT(vx, c, vl);
            v0 = VFMACCVF_FLOAT(v0, s, vy, vl);
            VSSEV_FLOAT(x, stride_x, v0, vl);

            /* v1 = c*y - s*x */
            v1 = VFMULVF_FLOAT(vx, s, vl);
            v1 = VFMSACVF_FLOAT(v1, c, vy, vl);
            VSSEV_FLOAT(y, stride_y, v1, vl);
        }
    }

    return(0);
}
|
||||
@@ -28,27 +28,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m4(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m1()
|
||||
#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n)
|
||||
#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)()
|
||||
#define FLOAT_V_T vfloat32m4_t
|
||||
#define VLEV_FLOAT vle32_v_f32m4
|
||||
#define VLSEV_FLOAT vlse32_v_f32m4
|
||||
#define VSEV_FLOAT vse32_v_f32m4
|
||||
#define VSSEV_FLOAT vsse32_v_f32m4
|
||||
#define VFMACCVF_FLOAT vfmacc_vf_f32m4
|
||||
#define VFMULVF_FLOAT vfmul_vf_f32m4
|
||||
#define VFMSACVF_FLOAT vfmsac_vf_f32m4
|
||||
#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4)
|
||||
#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4)
|
||||
#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4)
|
||||
#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4)
|
||||
#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4)
|
||||
#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f32m4)
|
||||
#define VFMSACVF_FLOAT RISCV_RVV(vfmsac_vf_f32m4)
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m4(n)
|
||||
#define VSETVL_MAX vsetvlmax_e64m1()
|
||||
#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
|
||||
#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)()
|
||||
#define FLOAT_V_T vfloat64m4_t
|
||||
#define VLEV_FLOAT vle64_v_f64m4
|
||||
#define VLSEV_FLOAT vlse64_v_f64m4
|
||||
#define VSEV_FLOAT vse64_v_f64m4
|
||||
#define VSSEV_FLOAT vsse64_v_f64m4
|
||||
#define VFMACCVF_FLOAT vfmacc_vf_f64m4
|
||||
#define VFMULVF_FLOAT vfmul_vf_f64m4
|
||||
#define VFMSACVF_FLOAT vfmsac_vf_f64m4
|
||||
#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4)
|
||||
#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4)
|
||||
#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4)
|
||||
#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4)
|
||||
#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4)
|
||||
#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f64m4)
|
||||
#define VFMSACVF_FLOAT RISCV_RVV(vfmsac_vf_f64m4)
|
||||
#endif
|
||||
|
||||
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
|
||||
@@ -57,11 +57,10 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
||||
BLASLONG ix=0,iy=0;
|
||||
|
||||
if(n <= 0) return(0);
|
||||
unsigned int gvl = 0;
|
||||
unsigned int gvl = VSETVL((inc_x != 0 && inc_y != 0) ? n : 1);
|
||||
FLOAT_V_T v0, v1, vx, vy;
|
||||
|
||||
if(inc_x == 1 && inc_y == 1){
|
||||
gvl = VSETVL(n);
|
||||
for(i=0,j=0; i<n/gvl; i++){
|
||||
vx = VLEV_FLOAT(&x[j], gvl);
|
||||
vy = VLEV_FLOAT(&y[j], gvl);
|
||||
@@ -90,7 +89,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
||||
VSEV_FLOAT(&y[j], v1, gvl);
|
||||
}
|
||||
}else if(inc_y == 1){
|
||||
gvl = VSETVL(n);
|
||||
BLASLONG stride_x = inc_x * sizeof(FLOAT);
|
||||
BLASLONG inc_xv = inc_x * gvl;
|
||||
for(i=0,j=0; i<n/gvl; i++){
|
||||
@@ -122,7 +120,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
||||
VSEV_FLOAT(&y[j], v1, gvl);
|
||||
}
|
||||
}else if(inc_x == 1){
|
||||
gvl = VSETVL(n);
|
||||
BLASLONG stride_y = inc_y * sizeof(FLOAT);
|
||||
BLASLONG inc_yv = inc_y * gvl;
|
||||
for(i=0,j=0; i<n/gvl; i++){
|
||||
@@ -154,8 +151,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
||||
VSSEV_FLOAT(&y[j*inc_y], stride_y, v1, gvl);
|
||||
}
|
||||
}else{
|
||||
gvl = VSETVL(n);
|
||||
if (inc_x == 0 && inc_y == 0) gvl = VSETVL(1);
|
||||
BLASLONG stride_x = inc_x * sizeof(FLOAT);
|
||||
BLASLONG stride_y = inc_y * sizeof(FLOAT);
|
||||
BLASLONG inc_xv = inc_x * gvl;
|
||||
|
||||
97
kernel/riscv64/scal_rvv.c
Normal file
97
kernel/riscv64/scal_rvv.c
Normal file
@@ -0,0 +1,97 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2020, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Precision-dependent aliases for the RVV intrinsics used by the scaling
 * kernel below.  Building with DOUBLE defined selects the 64-bit element
 * (e64 / f64) m8 intrinsics; otherwise the 32-bit (e32 / f32) m8 intrinsics
 * are used.
 */
#if defined(DOUBLE)
#define VSETVL(n)       __riscv_vsetvl_e64m8(n)
#define VSETVL_MAX      __riscv_vsetvlmax_e64m8()
#define FLOAT_V_T       vfloat64m8_t
#define VLEV_FLOAT      __riscv_vle64_v_f64m8
#define VLSEV_FLOAT     __riscv_vlse64_v_f64m8
#define VSEV_FLOAT      __riscv_vse64_v_f64m8
#define VSSEV_FLOAT     __riscv_vsse64_v_f64m8
#define VFMULVF_FLOAT   __riscv_vfmul_vf_f64m8
#define VFMVVF_FLOAT    __riscv_vfmv_v_f_f64m8
#else
#define VSETVL(n)       __riscv_vsetvl_e32m8(n)
#define VSETVL_MAX      __riscv_vsetvlmax_e32m8()
#define FLOAT_V_T       vfloat32m8_t
#define VLEV_FLOAT      __riscv_vle32_v_f32m8
#define VLSEV_FLOAT     __riscv_vlse32_v_f32m8
#define VSEV_FLOAT      __riscv_vse32_v_f32m8
#define VSSEV_FLOAT     __riscv_vsse32_v_f32m8
#define VFMULVF_FLOAT   __riscv_vfmul_vf_f32m8
#define VFMVVF_FLOAT    __riscv_vfmv_v_f_f32m8
#endif
|
||||
|
||||
/*
 * Scale the vector x by the scalar da, in place: x[i] <- da * x[i].
 *
 * When da compares equal to 0.0 the elements are overwritten with 0.0
 * directly (x is not read).  Nothing is done when n <= 0 or inc_x <= 0.
 *
 * Parameters:
 *   n       number of elements to scale
 *   da      scale factor
 *   x       vector, overwritten
 *   inc_x   stride of x, in elements
 *   dummy0, dummy1, y, inc_y, dummy, dummy2
 *           not used by this kernel
 *
 * Returns 0 in all cases.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
    size_t vl;
    FLOAT_V_T vec;

    if (n < 1 || inc_x < 1) {
        return 0;
    }

    if (inc_x != 1) {
        /* Strided access: the vector load/store stride is given in bytes. */
        BLASLONG byte_stride = inc_x * sizeof(FLOAT);

        if (da != 0.0) {
            while (n > 0) {
                vl = VSETVL(n);
                vec = VLSEV_FLOAT(x, byte_stride, vl);
                vec = VFMULVF_FLOAT(vec, da, vl);
                VSSEV_FLOAT(x, byte_stride, vec, vl);
                n -= vl;
                x += vl*inc_x;
            }
        } else {
            /* Build one zero vector up front and store it repeatedly. */
            int vlmax = VSETVL_MAX;
            vec = VFMVVF_FLOAT(0.0, vlmax);
            while (n > 0) {
                vl = VSETVL(n);
                VSSEV_FLOAT(x, byte_stride, vec, vl);
                n -= vl;
                x += vl*inc_x;
            }
        }

        return 0;
    }

    /* Contiguous access. */
    if (da != 0.0) {
        while (n > 0) {
            vl = VSETVL(n);
            vec = VLEV_FLOAT(x, vl);
            vec = VFMULVF_FLOAT(vec, da, vl);
            VSEV_FLOAT(x, vec, vl);
            n -= vl;
            x += vl;
        }
    } else {
        /* Build one zero vector up front and store it repeatedly. */
        int vlmax = VSETVL_MAX;
        vec = VFMVVF_FLOAT(0.0, vlmax);
        while (n > 0) {
            vl = VSETVL(n);
            VSEV_FLOAT(x, vec, vl);
            n -= vl;
            x += vl;
        }
    }

    return 0;
}
|
||||
@@ -26,28 +26,41 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m1()
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define VLEV_FLOAT vle32_v_f32m8
|
||||
#define VLSEV_FLOAT vlse32_v_f32m8
|
||||
#define VSEV_FLOAT vse32_v_f32m8
|
||||
#define VSSEV_FLOAT vsse32_v_f32m8
|
||||
#define VFMULVF_FLOAT vfmul_vf_f32m8
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f32m8
|
||||
|
||||
#ifdef RISCV64_ZVL256B
|
||||
# define LMUL m2
|
||||
# if defined(DOUBLE)
|
||||
# define ELEN 64
|
||||
# define MLEN 32
|
||||
# else
|
||||
# define ELEN 32
|
||||
# define MLEN 16
|
||||
# endif
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e64m1()
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define VLEV_FLOAT vle64_v_f64m8
|
||||
#define VLSEV_FLOAT vlse64_v_f64m8
|
||||
#define VSEV_FLOAT vse64_v_f64m8
|
||||
#define VSSEV_FLOAT vsse64_v_f64m8
|
||||
#define VFMULVF_FLOAT vfmul_vf_f64m8
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f64m8
|
||||
# define LMUL m8
|
||||
# if defined(DOUBLE)
|
||||
# define ELEN 64
|
||||
# define MLEN 8
|
||||
# else
|
||||
# define ELEN 32
|
||||
# define MLEN 4
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#define _
|
||||
#define JOIN2_X(x, y) x ## y
|
||||
#define JOIN2(x, y) JOIN2_X(x, y)
|
||||
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
|
||||
|
||||
#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _)
|
||||
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
|
||||
#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL)
|
||||
#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL)
|
||||
#define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL)
|
||||
#define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL)
|
||||
#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _)
|
||||
#define VFMULVF_FLOAT JOIN(RISCV_RVV(vfmul), _vf_f, ELEN, LMUL, _)
|
||||
|
||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
|
||||
{
|
||||
BLASLONG i=0,j=0;
|
||||
@@ -84,25 +97,25 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
|
||||
}
|
||||
}else{
|
||||
if(da == 0.0){
|
||||
BLASLONG stride_x = inc_x * sizeof(FLOAT);
|
||||
BLASLONG ix = 0;
|
||||
gvl = VSETVL(n);
|
||||
BLASLONG stride_x = inc_x * sizeof(FLOAT);
|
||||
BLASLONG ix = 0;
|
||||
if(gvl <= n / 2){
|
||||
long int inc_xv = gvl * inc_x;
|
||||
v0 = VFMVVF_FLOAT(0, gvl);
|
||||
for(i = 0, j = 0; i < n/(2*gvl); i++, j+=2*gvl){
|
||||
VSSEV_FLOAT(&x[ix], stride_x, v0, gvl);
|
||||
VSSEV_FLOAT(&x[ix + inc_xv], stride_x, v0, gvl);
|
||||
ix += inc_xv * 2;
|
||||
}
|
||||
v0 = VFMVVF_FLOAT(0, gvl);
|
||||
|
||||
for(i = 0; i < n/(gvl*2); ++i ){
|
||||
VSSEV_FLOAT(&x[ix], stride_x, v0, gvl);
|
||||
ix += inc_x * gvl;
|
||||
VSSEV_FLOAT(&x[ix], stride_x, v0, gvl);
|
||||
ix += inc_x * gvl;
|
||||
}
|
||||
//tail
|
||||
for(; j <n; ){
|
||||
gvl = VSETVL(n-j);
|
||||
|
||||
i *= gvl*2;
|
||||
while( i < n ){
|
||||
gvl = VSETVL(n-i);
|
||||
v0 = VFMVVF_FLOAT(0, gvl);
|
||||
VSSEV_FLOAT(&x[ix], stride_x, v0, gvl);
|
||||
j += gvl;
|
||||
ix += inc_x * gvl;
|
||||
VSSEV_FLOAT(&x[ix], stride_x, v0, gvl);
|
||||
i += gvl;
|
||||
ix += inc_x * gvl;
|
||||
}
|
||||
}else{
|
||||
gvl = VSETVL(n);
|
||||
|
||||
1081
kernel/riscv64/sgemm_kernel_16x8_zvl256b.c
Normal file
1081
kernel/riscv64/sgemm_kernel_16x8_zvl256b.c
Normal file
File diff suppressed because it is too large
Load Diff
791
kernel/riscv64/sgemm_kernel_8x8_zvl128b.c
Normal file
791
kernel/riscv64/sgemm_kernel_8x8_zvl128b.c
Normal file
@@ -0,0 +1,791 @@
|
||||
/*
|
||||
|
||||
AUTOGENERATED KERNEL
|
||||
Script: ./kernel/riscv64/generate_kernel.py
|
||||
Settings:
|
||||
LMUL=2
|
||||
M=8
|
||||
M_tail_scalar_from=2
|
||||
N=8
|
||||
__riscv_='__riscv_'
|
||||
complex=False
|
||||
conjugate=False
|
||||
cpu='zvl128b'
|
||||
force_acc_double=False
|
||||
index_type='BLASLONG'
|
||||
op='gemm'
|
||||
param_precision='float'
|
||||
reg_width_bits=128
|
||||
tail_policy=''
|
||||
trace=False
|
||||
|
||||
Derived:
|
||||
ELEN_ACC=32
|
||||
ELEN_PARAM=32
|
||||
LMUL_ACC=2
|
||||
VFMACC='__riscv_vfmacc_vf_f32m2'
|
||||
VFMUL='__riscv_vfmul_vf_f32m2'
|
||||
VLEV='__riscv_vle32_v_f32m2'
|
||||
VLSEV='__riscv_vlse32_v_f32m2'
|
||||
VMACC_TO_ACC='__riscv_vfmacc_vf_f32m2'
|
||||
VMUL_TO_ACC='__riscv_vfmul_vf_f32m2'
|
||||
VSETVL='__riscv_vsetvl_e32m2'
|
||||
VSEV='__riscv_vse32_v_f32m2'
|
||||
VSSEV='__riscv_vsse32_v_f32m2'
|
||||
acc_vector_t='vfloat32m2_t'
|
||||
output='sgemm_kernel_8x8_zvl128b.c'
|
||||
param_scalar_t='float'
|
||||
param_vector_t='vfloat32m2_t'
|
||||
|
||||
*/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc)
|
||||
|
||||
{
|
||||
BLASLONG gvl = 0;
|
||||
BLASLONG m_top = 0;
|
||||
BLASLONG n_top = 0;
|
||||
|
||||
// -- MAIN PASS
|
||||
|
||||
for (BLASLONG j = 0; j < N / 8; j += 1) {
|
||||
m_top = 0;
|
||||
BLASLONG gvl = __riscv_vsetvl_e32m2(8);
|
||||
|
||||
for (BLASLONG i = 0; i < M / 8; i += 1) {
|
||||
BLASLONG ai = m_top * K;
|
||||
BLASLONG bi = n_top * K;
|
||||
float B0 = B[bi + 0];
|
||||
float B1 = B[bi + 1];
|
||||
float B2 = B[bi + 2];
|
||||
float B3 = B[bi + 3];
|
||||
float B4 = B[bi + 4];
|
||||
float B5 = B[bi + 5];
|
||||
float B6 = B[bi + 6];
|
||||
float B7 = B[bi + 7];
|
||||
bi += 8;
|
||||
|
||||
vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
|
||||
ai += 8;
|
||||
|
||||
vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
|
||||
vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl);
|
||||
vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl);
|
||||
vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl);
|
||||
vfloat32m2_t result4 = __riscv_vfmul_vf_f32m2(A0, B4, gvl);
|
||||
vfloat32m2_t result5 = __riscv_vfmul_vf_f32m2(A0, B5, gvl);
|
||||
vfloat32m2_t result6 = __riscv_vfmul_vf_f32m2(A0, B6, gvl);
|
||||
vfloat32m2_t result7 = __riscv_vfmul_vf_f32m2(A0, B7, gvl);
|
||||
|
||||
for (BLASLONG k = 1; k < K; k++) {
|
||||
B0 = B[bi + 0];
|
||||
B1 = B[bi + 1];
|
||||
B2 = B[bi + 2];
|
||||
B3 = B[bi + 3];
|
||||
B4 = B[bi + 4];
|
||||
B5 = B[bi + 5];
|
||||
B6 = B[bi + 6];
|
||||
B7 = B[bi + 7];
|
||||
bi += 8;
|
||||
|
||||
A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
|
||||
ai += 8;
|
||||
|
||||
result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
|
||||
result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl);
|
||||
result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl);
|
||||
result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl);
|
||||
result4 = __riscv_vfmacc_vf_f32m2(result4, B4, A0, gvl);
|
||||
result5 = __riscv_vfmacc_vf_f32m2(result5, B5, A0, gvl);
|
||||
result6 = __riscv_vfmacc_vf_f32m2(result6, B6, A0, gvl);
|
||||
result7 = __riscv_vfmacc_vf_f32m2(result7, B7, A0, gvl);
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
|
||||
vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
vfloat32m2_t c2 = __riscv_vle32_v_f32m2(&C[ci], gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
vfloat32m2_t c3 = __riscv_vle32_v_f32m2(&C[ci], gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
vfloat32m2_t c4 = __riscv_vle32_v_f32m2(&C[ci], gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
vfloat32m2_t c5 = __riscv_vle32_v_f32m2(&C[ci], gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
vfloat32m2_t c6 = __riscv_vle32_v_f32m2(&C[ci], gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
vfloat32m2_t c7 = __riscv_vle32_v_f32m2(&C[ci], gvl);
|
||||
c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
|
||||
c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl);
|
||||
c2 = __riscv_vfmacc_vf_f32m2(c2, alpha, result2, gvl);
|
||||
c3 = __riscv_vfmacc_vf_f32m2(c3, alpha, result3, gvl);
|
||||
c4 = __riscv_vfmacc_vf_f32m2(c4, alpha, result4, gvl);
|
||||
c5 = __riscv_vfmacc_vf_f32m2(c5, alpha, result5, gvl);
|
||||
c6 = __riscv_vfmacc_vf_f32m2(c6, alpha, result6, gvl);
|
||||
c7 = __riscv_vfmacc_vf_f32m2(c7, alpha, result7, gvl);
|
||||
|
||||
ci = n_top * ldc + m_top;
|
||||
|
||||
__riscv_vse32_v_f32m2(&C[ci], c0, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c1, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c2, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c3, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c4, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c5, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c6, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c7, gvl);
|
||||
m_top += 8;
|
||||
}
|
||||
|
||||
// -- tails for main pass
|
||||
|
||||
if (M & 4) {
|
||||
gvl = __riscv_vsetvl_e32m2(4);
|
||||
|
||||
BLASLONG ai = m_top * K;
|
||||
BLASLONG bi = n_top * K;
|
||||
float B0 = B[bi + 0];
|
||||
float B1 = B[bi + 1];
|
||||
float B2 = B[bi + 2];
|
||||
float B3 = B[bi + 3];
|
||||
float B4 = B[bi + 4];
|
||||
float B5 = B[bi + 5];
|
||||
float B6 = B[bi + 6];
|
||||
float B7 = B[bi + 7];
|
||||
bi += 8;
|
||||
|
||||
vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
|
||||
ai += 4;
|
||||
|
||||
vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
|
||||
vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl);
|
||||
vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl);
|
||||
vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl);
|
||||
vfloat32m2_t result4 = __riscv_vfmul_vf_f32m2(A0, B4, gvl);
|
||||
vfloat32m2_t result5 = __riscv_vfmul_vf_f32m2(A0, B5, gvl);
|
||||
vfloat32m2_t result6 = __riscv_vfmul_vf_f32m2(A0, B6, gvl);
|
||||
vfloat32m2_t result7 = __riscv_vfmul_vf_f32m2(A0, B7, gvl);
|
||||
|
||||
for (BLASLONG k = 1; k < K; k++) {
|
||||
B0 = B[bi + 0];
|
||||
B1 = B[bi + 1];
|
||||
B2 = B[bi + 2];
|
||||
B3 = B[bi + 3];
|
||||
B4 = B[bi + 4];
|
||||
B5 = B[bi + 5];
|
||||
B6 = B[bi + 6];
|
||||
B7 = B[bi + 7];
|
||||
bi += 8;
|
||||
|
||||
A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
|
||||
ai += 4;
|
||||
|
||||
result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
|
||||
result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl);
|
||||
result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl);
|
||||
result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl);
|
||||
result4 = __riscv_vfmacc_vf_f32m2(result4, B4, A0, gvl);
|
||||
result5 = __riscv_vfmacc_vf_f32m2(result5, B5, A0, gvl);
|
||||
result6 = __riscv_vfmacc_vf_f32m2(result6, B6, A0, gvl);
|
||||
result7 = __riscv_vfmacc_vf_f32m2(result7, B7, A0, gvl);
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
|
||||
vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
vfloat32m2_t c2 = __riscv_vle32_v_f32m2(&C[ci], gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
vfloat32m2_t c3 = __riscv_vle32_v_f32m2(&C[ci], gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
vfloat32m2_t c4 = __riscv_vle32_v_f32m2(&C[ci], gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
vfloat32m2_t c5 = __riscv_vle32_v_f32m2(&C[ci], gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
vfloat32m2_t c6 = __riscv_vle32_v_f32m2(&C[ci], gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
vfloat32m2_t c7 = __riscv_vle32_v_f32m2(&C[ci], gvl);
|
||||
c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
|
||||
c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl);
|
||||
c2 = __riscv_vfmacc_vf_f32m2(c2, alpha, result2, gvl);
|
||||
c3 = __riscv_vfmacc_vf_f32m2(c3, alpha, result3, gvl);
|
||||
c4 = __riscv_vfmacc_vf_f32m2(c4, alpha, result4, gvl);
|
||||
c5 = __riscv_vfmacc_vf_f32m2(c5, alpha, result5, gvl);
|
||||
c6 = __riscv_vfmacc_vf_f32m2(c6, alpha, result6, gvl);
|
||||
c7 = __riscv_vfmacc_vf_f32m2(c7, alpha, result7, gvl);
|
||||
|
||||
ci = n_top * ldc + m_top;
|
||||
|
||||
__riscv_vse32_v_f32m2(&C[ci], c0, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c1, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c2, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c3, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c4, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c5, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c6, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c7, gvl);
|
||||
m_top += 4;
|
||||
}
|
||||
|
||||
if (M & 2) {
|
||||
float result0 = 0;
|
||||
float result1 = 0;
|
||||
float result2 = 0;
|
||||
float result3 = 0;
|
||||
float result4 = 0;
|
||||
float result5 = 0;
|
||||
float result6 = 0;
|
||||
float result7 = 0;
|
||||
float result8 = 0;
|
||||
float result9 = 0;
|
||||
float result10 = 0;
|
||||
float result11 = 0;
|
||||
float result12 = 0;
|
||||
float result13 = 0;
|
||||
float result14 = 0;
|
||||
float result15 = 0;
|
||||
BLASLONG ai = m_top * K;
|
||||
BLASLONG bi = n_top * K;
|
||||
|
||||
for (BLASLONG k = 0; k < K; k++) {
|
||||
result0 += A[ai + 0] * B[bi + 0];
|
||||
result1 += A[ai + 1] * B[bi + 0];
|
||||
result2 += A[ai + 0] * B[bi + 1];
|
||||
result3 += A[ai + 1] * B[bi + 1];
|
||||
result4 += A[ai + 0] * B[bi + 2];
|
||||
result5 += A[ai + 1] * B[bi + 2];
|
||||
result6 += A[ai + 0] * B[bi + 3];
|
||||
result7 += A[ai + 1] * B[bi + 3];
|
||||
result8 += A[ai + 0] * B[bi + 4];
|
||||
result9 += A[ai + 1] * B[bi + 4];
|
||||
result10 += A[ai + 0] * B[bi + 5];
|
||||
result11 += A[ai + 1] * B[bi + 5];
|
||||
result12 += A[ai + 0] * B[bi + 6];
|
||||
result13 += A[ai + 1] * B[bi + 6];
|
||||
result14 += A[ai + 0] * B[bi + 7];
|
||||
result15 += A[ai + 1] * B[bi + 7];
|
||||
ai += 2;
|
||||
bi += 8;
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
C[ci + 0 * ldc + 0] += alpha * result0;
|
||||
C[ci + 0 * ldc + 1] += alpha * result1;
|
||||
C[ci + 1 * ldc + 0] += alpha * result2;
|
||||
C[ci + 1 * ldc + 1] += alpha * result3;
|
||||
C[ci + 2 * ldc + 0] += alpha * result4;
|
||||
C[ci + 2 * ldc + 1] += alpha * result5;
|
||||
C[ci + 3 * ldc + 0] += alpha * result6;
|
||||
C[ci + 3 * ldc + 1] += alpha * result7;
|
||||
C[ci + 4 * ldc + 0] += alpha * result8;
|
||||
C[ci + 4 * ldc + 1] += alpha * result9;
|
||||
C[ci + 5 * ldc + 0] += alpha * result10;
|
||||
C[ci + 5 * ldc + 1] += alpha * result11;
|
||||
C[ci + 6 * ldc + 0] += alpha * result12;
|
||||
C[ci + 6 * ldc + 1] += alpha * result13;
|
||||
C[ci + 7 * ldc + 0] += alpha * result14;
|
||||
C[ci + 7 * ldc + 1] += alpha * result15;
|
||||
m_top += 2;
|
||||
}
|
||||
|
||||
if (M & 1) {
|
||||
float result0 = 0;
|
||||
float result1 = 0;
|
||||
float result2 = 0;
|
||||
float result3 = 0;
|
||||
float result4 = 0;
|
||||
float result5 = 0;
|
||||
float result6 = 0;
|
||||
float result7 = 0;
|
||||
BLASLONG ai = m_top * K;
|
||||
BLASLONG bi = n_top * K;
|
||||
|
||||
for (BLASLONG k = 0; k < K; k++) {
|
||||
result0 += A[ai + 0] * B[bi + 0];
|
||||
result1 += A[ai + 0] * B[bi + 1];
|
||||
result2 += A[ai + 0] * B[bi + 2];
|
||||
result3 += A[ai + 0] * B[bi + 3];
|
||||
result4 += A[ai + 0] * B[bi + 4];
|
||||
result5 += A[ai + 0] * B[bi + 5];
|
||||
result6 += A[ai + 0] * B[bi + 6];
|
||||
result7 += A[ai + 0] * B[bi + 7];
|
||||
ai += 1;
|
||||
bi += 8;
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
C[ci + 0 * ldc + 0] += alpha * result0;
|
||||
C[ci + 1 * ldc + 0] += alpha * result1;
|
||||
C[ci + 2 * ldc + 0] += alpha * result2;
|
||||
C[ci + 3 * ldc + 0] += alpha * result3;
|
||||
C[ci + 4 * ldc + 0] += alpha * result4;
|
||||
C[ci + 5 * ldc + 0] += alpha * result5;
|
||||
C[ci + 6 * ldc + 0] += alpha * result6;
|
||||
C[ci + 7 * ldc + 0] += alpha * result7;
|
||||
m_top += 1;
|
||||
}
|
||||
|
||||
n_top += 8;
|
||||
}
|
||||
|
||||
// -- tails for N=4
|
||||
|
||||
if (N & 4) {
|
||||
gvl = __riscv_vsetvl_e32m2(8);
|
||||
m_top = 0;
|
||||
|
||||
for (BLASLONG i = 0; i < M / 8; i += 1) {
|
||||
BLASLONG ai = m_top * K;
|
||||
BLASLONG bi = n_top * K;
|
||||
float B0 = B[bi + 0];
|
||||
float B1 = B[bi + 1];
|
||||
float B2 = B[bi + 2];
|
||||
float B3 = B[bi + 3];
|
||||
bi += 4;
|
||||
|
||||
vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
|
||||
ai += 8;
|
||||
|
||||
vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
|
||||
vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl);
|
||||
vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl);
|
||||
vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl);
|
||||
|
||||
for (BLASLONG k = 1; k < K; k++) {
|
||||
B0 = B[bi + 0];
|
||||
B1 = B[bi + 1];
|
||||
B2 = B[bi + 2];
|
||||
B3 = B[bi + 3];
|
||||
bi += 4;
|
||||
|
||||
A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
|
||||
ai += 8;
|
||||
|
||||
result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
|
||||
result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl);
|
||||
result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl);
|
||||
result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl);
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
|
||||
vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
vfloat32m2_t c2 = __riscv_vle32_v_f32m2(&C[ci], gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
vfloat32m2_t c3 = __riscv_vle32_v_f32m2(&C[ci], gvl);
|
||||
c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
|
||||
c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl);
|
||||
c2 = __riscv_vfmacc_vf_f32m2(c2, alpha, result2, gvl);
|
||||
c3 = __riscv_vfmacc_vf_f32m2(c3, alpha, result3, gvl);
|
||||
|
||||
ci = n_top * ldc + m_top;
|
||||
|
||||
__riscv_vse32_v_f32m2(&C[ci], c0, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c1, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c2, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c3, gvl);
|
||||
m_top += 8;
|
||||
}
|
||||
|
||||
if (M & 4) {
|
||||
gvl = __riscv_vsetvl_e32m2(4);
|
||||
|
||||
BLASLONG ai = m_top * K;
|
||||
BLASLONG bi = n_top * K;
|
||||
float B0 = B[bi + 0];
|
||||
float B1 = B[bi + 1];
|
||||
float B2 = B[bi + 2];
|
||||
float B3 = B[bi + 3];
|
||||
bi += 4;
|
||||
|
||||
vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
|
||||
ai += 4;
|
||||
|
||||
vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
|
||||
vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl);
|
||||
vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl);
|
||||
vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl);
|
||||
|
||||
for (BLASLONG k = 1; k < K; k++) {
|
||||
B0 = B[bi + 0];
|
||||
B1 = B[bi + 1];
|
||||
B2 = B[bi + 2];
|
||||
B3 = B[bi + 3];
|
||||
bi += 4;
|
||||
|
||||
A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
|
||||
ai += 4;
|
||||
|
||||
result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
|
||||
result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl);
|
||||
result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl);
|
||||
result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl);
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
|
||||
vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
vfloat32m2_t c2 = __riscv_vle32_v_f32m2(&C[ci], gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
vfloat32m2_t c3 = __riscv_vle32_v_f32m2(&C[ci], gvl);
|
||||
c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
|
||||
c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl);
|
||||
c2 = __riscv_vfmacc_vf_f32m2(c2, alpha, result2, gvl);
|
||||
c3 = __riscv_vfmacc_vf_f32m2(c3, alpha, result3, gvl);
|
||||
|
||||
ci = n_top * ldc + m_top;
|
||||
|
||||
__riscv_vse32_v_f32m2(&C[ci], c0, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c1, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c2, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c3, gvl);
|
||||
m_top += 4;
|
||||
}
|
||||
|
||||
if (M & 2) {
|
||||
float result0 = 0;
|
||||
float result1 = 0;
|
||||
float result2 = 0;
|
||||
float result3 = 0;
|
||||
float result4 = 0;
|
||||
float result5 = 0;
|
||||
float result6 = 0;
|
||||
float result7 = 0;
|
||||
BLASLONG ai = m_top * K;
|
||||
BLASLONG bi = n_top * K;
|
||||
|
||||
for (BLASLONG k = 0; k < K; k++) {
|
||||
result0 += A[ai + 0] * B[bi + 0];
|
||||
result1 += A[ai + 1] * B[bi + 0];
|
||||
result2 += A[ai + 0] * B[bi + 1];
|
||||
result3 += A[ai + 1] * B[bi + 1];
|
||||
result4 += A[ai + 0] * B[bi + 2];
|
||||
result5 += A[ai + 1] * B[bi + 2];
|
||||
result6 += A[ai + 0] * B[bi + 3];
|
||||
result7 += A[ai + 1] * B[bi + 3];
|
||||
ai += 2;
|
||||
bi += 4;
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
C[ci + 0 * ldc + 0] += alpha * result0;
|
||||
C[ci + 0 * ldc + 1] += alpha * result1;
|
||||
C[ci + 1 * ldc + 0] += alpha * result2;
|
||||
C[ci + 1 * ldc + 1] += alpha * result3;
|
||||
C[ci + 2 * ldc + 0] += alpha * result4;
|
||||
C[ci + 2 * ldc + 1] += alpha * result5;
|
||||
C[ci + 3 * ldc + 0] += alpha * result6;
|
||||
C[ci + 3 * ldc + 1] += alpha * result7;
|
||||
m_top += 2;
|
||||
}
|
||||
|
||||
if (M & 1) {
|
||||
float result0 = 0;
|
||||
float result1 = 0;
|
||||
float result2 = 0;
|
||||
float result3 = 0;
|
||||
BLASLONG ai = m_top * K;
|
||||
BLASLONG bi = n_top * K;
|
||||
|
||||
for (BLASLONG k = 0; k < K; k++) {
|
||||
result0 += A[ai + 0] * B[bi + 0];
|
||||
result1 += A[ai + 0] * B[bi + 1];
|
||||
result2 += A[ai + 0] * B[bi + 2];
|
||||
result3 += A[ai + 0] * B[bi + 3];
|
||||
ai += 1;
|
||||
bi += 4;
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
C[ci + 0 * ldc + 0] += alpha * result0;
|
||||
C[ci + 1 * ldc + 0] += alpha * result1;
|
||||
C[ci + 2 * ldc + 0] += alpha * result2;
|
||||
C[ci + 3 * ldc + 0] += alpha * result3;
|
||||
m_top += 1;
|
||||
}
|
||||
|
||||
n_top += 4;
|
||||
}
|
||||
|
||||
// -- tails for N=2
|
||||
|
||||
if (N & 2) {
|
||||
gvl = __riscv_vsetvl_e32m2(8);
|
||||
m_top = 0;
|
||||
|
||||
for (BLASLONG i = 0; i < M / 8; i += 1) {
|
||||
BLASLONG ai = m_top * K;
|
||||
BLASLONG bi = n_top * K;
|
||||
float B0 = B[bi + 0];
|
||||
float B1 = B[bi + 1];
|
||||
bi += 2;
|
||||
|
||||
vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
|
||||
ai += 8;
|
||||
|
||||
vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
|
||||
vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl);
|
||||
|
||||
for (BLASLONG k = 1; k < K; k++) {
|
||||
B0 = B[bi + 0];
|
||||
B1 = B[bi + 1];
|
||||
bi += 2;
|
||||
|
||||
A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
|
||||
ai += 8;
|
||||
|
||||
result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
|
||||
result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl);
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
|
||||
vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl);
|
||||
c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
|
||||
c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl);
|
||||
|
||||
ci = n_top * ldc + m_top;
|
||||
|
||||
__riscv_vse32_v_f32m2(&C[ci], c0, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c1, gvl);
|
||||
m_top += 8;
|
||||
}
|
||||
|
||||
if (M & 4) {
|
||||
gvl = __riscv_vsetvl_e32m2(4);
|
||||
|
||||
BLASLONG ai = m_top * K;
|
||||
BLASLONG bi = n_top * K;
|
||||
float B0 = B[bi + 0];
|
||||
float B1 = B[bi + 1];
|
||||
bi += 2;
|
||||
|
||||
vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
|
||||
ai += 4;
|
||||
|
||||
vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
|
||||
vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl);
|
||||
|
||||
for (BLASLONG k = 1; k < K; k++) {
|
||||
B0 = B[bi + 0];
|
||||
B1 = B[bi + 1];
|
||||
bi += 2;
|
||||
|
||||
A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
|
||||
ai += 4;
|
||||
|
||||
result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
|
||||
result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl);
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
|
||||
vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl);
|
||||
c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
|
||||
c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl);
|
||||
|
||||
ci = n_top * ldc + m_top;
|
||||
|
||||
__riscv_vse32_v_f32m2(&C[ci], c0, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c1, gvl);
|
||||
m_top += 4;
|
||||
}
|
||||
|
||||
if (M & 2) {
|
||||
float result0 = 0;
|
||||
float result1 = 0;
|
||||
float result2 = 0;
|
||||
float result3 = 0;
|
||||
BLASLONG ai = m_top * K;
|
||||
BLASLONG bi = n_top * K;
|
||||
|
||||
for (BLASLONG k = 0; k < K; k++) {
|
||||
result0 += A[ai + 0] * B[bi + 0];
|
||||
result1 += A[ai + 1] * B[bi + 0];
|
||||
result2 += A[ai + 0] * B[bi + 1];
|
||||
result3 += A[ai + 1] * B[bi + 1];
|
||||
ai += 2;
|
||||
bi += 2;
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
C[ci + 0 * ldc + 0] += alpha * result0;
|
||||
C[ci + 0 * ldc + 1] += alpha * result1;
|
||||
C[ci + 1 * ldc + 0] += alpha * result2;
|
||||
C[ci + 1 * ldc + 1] += alpha * result3;
|
||||
m_top += 2;
|
||||
}
|
||||
|
||||
if (M & 1) {
|
||||
float result0 = 0;
|
||||
float result1 = 0;
|
||||
BLASLONG ai = m_top * K;
|
||||
BLASLONG bi = n_top * K;
|
||||
|
||||
for (BLASLONG k = 0; k < K; k++) {
|
||||
result0 += A[ai + 0] * B[bi + 0];
|
||||
result1 += A[ai + 0] * B[bi + 1];
|
||||
ai += 1;
|
||||
bi += 2;
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
C[ci + 0 * ldc + 0] += alpha * result0;
|
||||
C[ci + 1 * ldc + 0] += alpha * result1;
|
||||
m_top += 1;
|
||||
}
|
||||
|
||||
n_top += 2;
|
||||
}
|
||||
|
||||
// -- tails for N=1
|
||||
|
||||
if (N & 1) {
|
||||
gvl = __riscv_vsetvl_e32m2(8);
|
||||
m_top = 0;
|
||||
|
||||
for (BLASLONG i = 0; i < M / 8; i += 1) {
|
||||
BLASLONG ai = m_top * K;
|
||||
BLASLONG bi = n_top * K;
|
||||
float B0 = B[bi + 0];
|
||||
bi += 1;
|
||||
|
||||
vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
|
||||
ai += 8;
|
||||
|
||||
vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
|
||||
|
||||
for (BLASLONG k = 1; k < K; k++) {
|
||||
B0 = B[bi + 0];
|
||||
bi += 1;
|
||||
|
||||
A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
|
||||
ai += 8;
|
||||
|
||||
result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
|
||||
vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl);
|
||||
c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
|
||||
|
||||
ci = n_top * ldc + m_top;
|
||||
|
||||
__riscv_vse32_v_f32m2(&C[ci], c0, gvl);
|
||||
m_top += 8;
|
||||
}
|
||||
|
||||
if (M & 4) {
|
||||
gvl = __riscv_vsetvl_e32m2(4);
|
||||
|
||||
BLASLONG ai = m_top * K;
|
||||
BLASLONG bi = n_top * K;
|
||||
float B0 = B[bi + 0];
|
||||
bi += 1;
|
||||
|
||||
vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
|
||||
ai += 4;
|
||||
|
||||
vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
|
||||
|
||||
for (BLASLONG k = 1; k < K; k++) {
|
||||
B0 = B[bi + 0];
|
||||
bi += 1;
|
||||
|
||||
A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
|
||||
ai += 4;
|
||||
|
||||
result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
|
||||
vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl);
|
||||
c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
|
||||
|
||||
ci = n_top * ldc + m_top;
|
||||
|
||||
__riscv_vse32_v_f32m2(&C[ci], c0, gvl);
|
||||
m_top += 4;
|
||||
}
|
||||
|
||||
if (M & 2) {
|
||||
float result0 = 0;
|
||||
float result1 = 0;
|
||||
BLASLONG ai = m_top * K;
|
||||
BLASLONG bi = n_top * K;
|
||||
|
||||
for (BLASLONG k = 0; k < K; k++) {
|
||||
result0 += A[ai + 0] * B[bi + 0];
|
||||
result1 += A[ai + 1] * B[bi + 0];
|
||||
ai += 2;
|
||||
bi += 1;
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
C[ci + 0 * ldc + 0] += alpha * result0;
|
||||
C[ci + 0 * ldc + 1] += alpha * result1;
|
||||
m_top += 2;
|
||||
}
|
||||
|
||||
if (M & 1) {
|
||||
float result0 = 0;
|
||||
BLASLONG ai = m_top * K;
|
||||
BLASLONG bi = n_top * K;
|
||||
|
||||
for (BLASLONG k = 0; k < K; k++) {
|
||||
result0 += A[ai + 0] * B[bi + 0];
|
||||
ai += 1;
|
||||
bi += 1;
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
C[ci + 0 * ldc + 0] += alpha * result0;
|
||||
m_top += 1;
|
||||
}
|
||||
|
||||
n_top += 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
1330
kernel/riscv64/strmm_kernel_16x8_zvl256b.c
Normal file
1330
kernel/riscv64/strmm_kernel_16x8_zvl256b.c
Normal file
File diff suppressed because it is too large
Load Diff
991
kernel/riscv64/strmm_kernel_8x8_zvl128b.c
Normal file
991
kernel/riscv64/strmm_kernel_8x8_zvl128b.c
Normal file
@@ -0,0 +1,991 @@
|
||||
/*
|
||||
|
||||
AUTOGENERATED KERNEL
|
||||
Script: ./kernel/riscv64/generate_kernel.py
|
||||
Settings:
|
||||
LMUL=2
|
||||
M=8
|
||||
M_tail_scalar_from=2
|
||||
N=8
|
||||
__riscv_='__riscv_'
|
||||
complex=False
|
||||
conjugate=False
|
||||
cpu='zvl128b'
|
||||
force_acc_double=False
|
||||
index_type='BLASLONG'
|
||||
op='trmm'
|
||||
param_precision='float'
|
||||
reg_width_bits=128
|
||||
tail_policy=''
|
||||
trace=False
|
||||
|
||||
Derived:
|
||||
ELEN_ACC=32
|
||||
ELEN_PARAM=32
|
||||
LMUL_ACC=2
|
||||
VFMACC='__riscv_vfmacc_vf_f32m2'
|
||||
VFMUL='__riscv_vfmul_vf_f32m2'
|
||||
VLEV='__riscv_vle32_v_f32m2'
|
||||
VLSEV='__riscv_vlse32_v_f32m2'
|
||||
VMACC_TO_ACC='__riscv_vfmacc_vf_f32m2'
|
||||
VMUL_TO_ACC='__riscv_vfmul_vf_f32m2'
|
||||
VSETVL='__riscv_vsetvl_e32m2'
|
||||
VSEV='__riscv_vse32_v_f32m2'
|
||||
VSSEV='__riscv_vsse32_v_f32m2'
|
||||
acc_vector_t='vfloat32m2_t'
|
||||
output='strmm_kernel_8x8_zvl128b.c'
|
||||
param_scalar_t='float'
|
||||
param_vector_t='vfloat32m2_t'
|
||||
|
||||
*/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
// BACKWARDS is defined when exactly one of LEFT and TRANSA is defined
// (the two defined() results differ); it is left undefined when both or
// neither are defined.
//
// The kernel below uses it to choose how each tile's k-range is trimmed:
//   - BACKWARDS:     the A and B indices (ai, bi) are advanced by `off`
//                    k-steps and pass_K is reduced by `off`.
//   - not BACKWARDS: ai/bi are left at the start of the tile and pass_K is
//                    set to `off` plus the tile's M extent (when LEFT is
//                    defined) or its N extent (otherwise).
#if defined(LEFT) != defined(TRANSA)
|
||||
#define BACKWARDS
|
||||
#endif
|
||||
|
||||
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc, BLASLONG offset)
|
||||
|
||||
{
|
||||
BLASLONG gvl = 0;
|
||||
BLASLONG m_top = 0;
|
||||
BLASLONG n_top = 0;
|
||||
|
||||
// -- MAIN PASS
|
||||
|
||||
for (BLASLONG j = 0; j < N / 8; j += 1) {
|
||||
m_top = 0;
|
||||
BLASLONG gvl = __riscv_vsetvl_e32m2(8);
|
||||
|
||||
for (BLASLONG i = 0; i < M / 8; i += 1) {
|
||||
BLASLONG ai = m_top * K;
|
||||
BLASLONG bi = n_top * K;
|
||||
BLASLONG pass_K = K;
|
||||
#ifdef LEFT
|
||||
BLASLONG off = offset + m_top;
|
||||
#else
|
||||
BLASLONG off = -offset + n_top;
|
||||
#endif
|
||||
#ifdef BACKWARDS
|
||||
ai += off * 8;
|
||||
bi += off * 8;
|
||||
pass_K -= off;
|
||||
#else
|
||||
#ifdef LEFT
|
||||
pass_K = off + 8;
|
||||
#else
|
||||
pass_K = off + 8;
|
||||
#endif
|
||||
#endif
|
||||
float B0 = B[bi + 0];
|
||||
float B1 = B[bi + 1];
|
||||
float B2 = B[bi + 2];
|
||||
float B3 = B[bi + 3];
|
||||
float B4 = B[bi + 4];
|
||||
float B5 = B[bi + 5];
|
||||
float B6 = B[bi + 6];
|
||||
float B7 = B[bi + 7];
|
||||
bi += 8;
|
||||
|
||||
vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
|
||||
ai += 8;
|
||||
|
||||
vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
|
||||
vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl);
|
||||
vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl);
|
||||
vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl);
|
||||
vfloat32m2_t result4 = __riscv_vfmul_vf_f32m2(A0, B4, gvl);
|
||||
vfloat32m2_t result5 = __riscv_vfmul_vf_f32m2(A0, B5, gvl);
|
||||
vfloat32m2_t result6 = __riscv_vfmul_vf_f32m2(A0, B6, gvl);
|
||||
vfloat32m2_t result7 = __riscv_vfmul_vf_f32m2(A0, B7, gvl);
|
||||
|
||||
for (BLASLONG k = 1; k < pass_K; k++) {
|
||||
B0 = B[bi + 0];
|
||||
B1 = B[bi + 1];
|
||||
B2 = B[bi + 2];
|
||||
B3 = B[bi + 3];
|
||||
B4 = B[bi + 4];
|
||||
B5 = B[bi + 5];
|
||||
B6 = B[bi + 6];
|
||||
B7 = B[bi + 7];
|
||||
bi += 8;
|
||||
|
||||
A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
|
||||
ai += 8;
|
||||
|
||||
result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
|
||||
result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl);
|
||||
result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl);
|
||||
result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl);
|
||||
result4 = __riscv_vfmacc_vf_f32m2(result4, B4, A0, gvl);
|
||||
result5 = __riscv_vfmacc_vf_f32m2(result5, B5, A0, gvl);
|
||||
result6 = __riscv_vfmacc_vf_f32m2(result6, B6, A0, gvl);
|
||||
result7 = __riscv_vfmacc_vf_f32m2(result7, B7, A0, gvl);
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
|
||||
vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl);
|
||||
vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl);
|
||||
vfloat32m2_t c2 = __riscv_vfmul_vf_f32m2(result2, alpha, gvl);
|
||||
vfloat32m2_t c3 = __riscv_vfmul_vf_f32m2(result3, alpha, gvl);
|
||||
vfloat32m2_t c4 = __riscv_vfmul_vf_f32m2(result4, alpha, gvl);
|
||||
vfloat32m2_t c5 = __riscv_vfmul_vf_f32m2(result5, alpha, gvl);
|
||||
vfloat32m2_t c6 = __riscv_vfmul_vf_f32m2(result6, alpha, gvl);
|
||||
vfloat32m2_t c7 = __riscv_vfmul_vf_f32m2(result7, alpha, gvl);
|
||||
__riscv_vse32_v_f32m2(&C[ci], c0, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c1, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c2, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c3, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c4, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c5, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c6, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c7, gvl);
|
||||
m_top += 8;
|
||||
}
|
||||
|
||||
// -- tails for main pass
|
||||
|
||||
if (M & 4) {
|
||||
gvl = __riscv_vsetvl_e32m2(4);
|
||||
|
||||
BLASLONG ai = m_top * K;
|
||||
BLASLONG bi = n_top * K;
|
||||
BLASLONG pass_K = K;
|
||||
#ifdef LEFT
|
||||
BLASLONG off = offset + m_top;
|
||||
#else
|
||||
BLASLONG off = -offset + n_top;
|
||||
#endif
|
||||
#ifdef BACKWARDS
|
||||
ai += off * 4;
|
||||
bi += off * 8;
|
||||
pass_K -= off;
|
||||
#else
|
||||
#ifdef LEFT
|
||||
pass_K = off + 4;
|
||||
#else
|
||||
pass_K = off + 8;
|
||||
#endif
|
||||
#endif
|
||||
float B0 = B[bi + 0];
|
||||
float B1 = B[bi + 1];
|
||||
float B2 = B[bi + 2];
|
||||
float B3 = B[bi + 3];
|
||||
float B4 = B[bi + 4];
|
||||
float B5 = B[bi + 5];
|
||||
float B6 = B[bi + 6];
|
||||
float B7 = B[bi + 7];
|
||||
bi += 8;
|
||||
|
||||
vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
|
||||
ai += 4;
|
||||
|
||||
vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
|
||||
vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl);
|
||||
vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl);
|
||||
vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl);
|
||||
vfloat32m2_t result4 = __riscv_vfmul_vf_f32m2(A0, B4, gvl);
|
||||
vfloat32m2_t result5 = __riscv_vfmul_vf_f32m2(A0, B5, gvl);
|
||||
vfloat32m2_t result6 = __riscv_vfmul_vf_f32m2(A0, B6, gvl);
|
||||
vfloat32m2_t result7 = __riscv_vfmul_vf_f32m2(A0, B7, gvl);
|
||||
|
||||
for (BLASLONG k = 1; k < pass_K; k++) {
|
||||
B0 = B[bi + 0];
|
||||
B1 = B[bi + 1];
|
||||
B2 = B[bi + 2];
|
||||
B3 = B[bi + 3];
|
||||
B4 = B[bi + 4];
|
||||
B5 = B[bi + 5];
|
||||
B6 = B[bi + 6];
|
||||
B7 = B[bi + 7];
|
||||
bi += 8;
|
||||
|
||||
A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
|
||||
ai += 4;
|
||||
|
||||
result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
|
||||
result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl);
|
||||
result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl);
|
||||
result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl);
|
||||
result4 = __riscv_vfmacc_vf_f32m2(result4, B4, A0, gvl);
|
||||
result5 = __riscv_vfmacc_vf_f32m2(result5, B5, A0, gvl);
|
||||
result6 = __riscv_vfmacc_vf_f32m2(result6, B6, A0, gvl);
|
||||
result7 = __riscv_vfmacc_vf_f32m2(result7, B7, A0, gvl);
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
|
||||
vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl);
|
||||
vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl);
|
||||
vfloat32m2_t c2 = __riscv_vfmul_vf_f32m2(result2, alpha, gvl);
|
||||
vfloat32m2_t c3 = __riscv_vfmul_vf_f32m2(result3, alpha, gvl);
|
||||
vfloat32m2_t c4 = __riscv_vfmul_vf_f32m2(result4, alpha, gvl);
|
||||
vfloat32m2_t c5 = __riscv_vfmul_vf_f32m2(result5, alpha, gvl);
|
||||
vfloat32m2_t c6 = __riscv_vfmul_vf_f32m2(result6, alpha, gvl);
|
||||
vfloat32m2_t c7 = __riscv_vfmul_vf_f32m2(result7, alpha, gvl);
|
||||
__riscv_vse32_v_f32m2(&C[ci], c0, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c1, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c2, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c3, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c4, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c5, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c6, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c7, gvl);
|
||||
m_top += 4;
|
||||
}
|
||||
|
||||
if (M & 2) {
|
||||
float result0 = 0;
|
||||
float result1 = 0;
|
||||
float result2 = 0;
|
||||
float result3 = 0;
|
||||
float result4 = 0;
|
||||
float result5 = 0;
|
||||
float result6 = 0;
|
||||
float result7 = 0;
|
||||
float result8 = 0;
|
||||
float result9 = 0;
|
||||
float result10 = 0;
|
||||
float result11 = 0;
|
||||
float result12 = 0;
|
||||
float result13 = 0;
|
||||
float result14 = 0;
|
||||
float result15 = 0;
|
||||
BLASLONG ai = m_top * K;
|
||||
BLASLONG bi = n_top * K;
|
||||
BLASLONG pass_K = K;
|
||||
#ifdef LEFT
|
||||
BLASLONG off = offset + m_top;
|
||||
#else
|
||||
BLASLONG off = -offset + n_top;
|
||||
#endif
|
||||
#ifdef BACKWARDS
|
||||
ai += off * 2;
|
||||
bi += off * 8;
|
||||
pass_K -= off;
|
||||
#else
|
||||
#ifdef LEFT
|
||||
pass_K = off + 2;
|
||||
#else
|
||||
pass_K = off + 8;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
for (BLASLONG k = 0; k < pass_K; k++) {
|
||||
result0 += A[ai + 0] * B[bi + 0];
|
||||
result1 += A[ai + 1] * B[bi + 0];
|
||||
result2 += A[ai + 0] * B[bi + 1];
|
||||
result3 += A[ai + 1] * B[bi + 1];
|
||||
result4 += A[ai + 0] * B[bi + 2];
|
||||
result5 += A[ai + 1] * B[bi + 2];
|
||||
result6 += A[ai + 0] * B[bi + 3];
|
||||
result7 += A[ai + 1] * B[bi + 3];
|
||||
result8 += A[ai + 0] * B[bi + 4];
|
||||
result9 += A[ai + 1] * B[bi + 4];
|
||||
result10 += A[ai + 0] * B[bi + 5];
|
||||
result11 += A[ai + 1] * B[bi + 5];
|
||||
result12 += A[ai + 0] * B[bi + 6];
|
||||
result13 += A[ai + 1] * B[bi + 6];
|
||||
result14 += A[ai + 0] * B[bi + 7];
|
||||
result15 += A[ai + 1] * B[bi + 7];
|
||||
ai += 2;
|
||||
bi += 8;
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
C[ci + 0 * ldc + 0] = alpha * result0;
|
||||
C[ci + 0 * ldc + 1] = alpha * result1;
|
||||
C[ci + 1 * ldc + 0] = alpha * result2;
|
||||
C[ci + 1 * ldc + 1] = alpha * result3;
|
||||
C[ci + 2 * ldc + 0] = alpha * result4;
|
||||
C[ci + 2 * ldc + 1] = alpha * result5;
|
||||
C[ci + 3 * ldc + 0] = alpha * result6;
|
||||
C[ci + 3 * ldc + 1] = alpha * result7;
|
||||
C[ci + 4 * ldc + 0] = alpha * result8;
|
||||
C[ci + 4 * ldc + 1] = alpha * result9;
|
||||
C[ci + 5 * ldc + 0] = alpha * result10;
|
||||
C[ci + 5 * ldc + 1] = alpha * result11;
|
||||
C[ci + 6 * ldc + 0] = alpha * result12;
|
||||
C[ci + 6 * ldc + 1] = alpha * result13;
|
||||
C[ci + 7 * ldc + 0] = alpha * result14;
|
||||
C[ci + 7 * ldc + 1] = alpha * result15;
|
||||
m_top += 2;
|
||||
}
|
||||
|
||||
if (M & 1) {
|
||||
float result0 = 0;
|
||||
float result1 = 0;
|
||||
float result2 = 0;
|
||||
float result3 = 0;
|
||||
float result4 = 0;
|
||||
float result5 = 0;
|
||||
float result6 = 0;
|
||||
float result7 = 0;
|
||||
BLASLONG ai = m_top * K;
|
||||
BLASLONG bi = n_top * K;
|
||||
BLASLONG pass_K = K;
|
||||
#ifdef LEFT
|
||||
BLASLONG off = offset + m_top;
|
||||
#else
|
||||
BLASLONG off = -offset + n_top;
|
||||
#endif
|
||||
#ifdef BACKWARDS
|
||||
ai += off * 1;
|
||||
bi += off * 8;
|
||||
pass_K -= off;
|
||||
#else
|
||||
#ifdef LEFT
|
||||
pass_K = off + 1;
|
||||
#else
|
||||
pass_K = off + 8;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
for (BLASLONG k = 0; k < pass_K; k++) {
|
||||
result0 += A[ai + 0] * B[bi + 0];
|
||||
result1 += A[ai + 0] * B[bi + 1];
|
||||
result2 += A[ai + 0] * B[bi + 2];
|
||||
result3 += A[ai + 0] * B[bi + 3];
|
||||
result4 += A[ai + 0] * B[bi + 4];
|
||||
result5 += A[ai + 0] * B[bi + 5];
|
||||
result6 += A[ai + 0] * B[bi + 6];
|
||||
result7 += A[ai + 0] * B[bi + 7];
|
||||
ai += 1;
|
||||
bi += 8;
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
C[ci + 0 * ldc + 0] = alpha * result0;
|
||||
C[ci + 1 * ldc + 0] = alpha * result1;
|
||||
C[ci + 2 * ldc + 0] = alpha * result2;
|
||||
C[ci + 3 * ldc + 0] = alpha * result3;
|
||||
C[ci + 4 * ldc + 0] = alpha * result4;
|
||||
C[ci + 5 * ldc + 0] = alpha * result5;
|
||||
C[ci + 6 * ldc + 0] = alpha * result6;
|
||||
C[ci + 7 * ldc + 0] = alpha * result7;
|
||||
m_top += 1;
|
||||
}
|
||||
|
||||
n_top += 8;
|
||||
}
|
||||
|
||||
// -- tails for N=4
|
||||
|
||||
if (N & 4) {
|
||||
gvl = __riscv_vsetvl_e32m2(8);
|
||||
m_top = 0;
|
||||
|
||||
for (BLASLONG i = 0; i < M / 8; i += 1) {
|
||||
BLASLONG ai = m_top * K;
|
||||
BLASLONG bi = n_top * K;
|
||||
BLASLONG pass_K = K;
|
||||
#ifdef LEFT
|
||||
BLASLONG off = offset + m_top;
|
||||
#else
|
||||
BLASLONG off = -offset + n_top;
|
||||
#endif
|
||||
#ifdef BACKWARDS
|
||||
ai += off * 8;
|
||||
bi += off * 4;
|
||||
pass_K -= off;
|
||||
#else
|
||||
#ifdef LEFT
|
||||
pass_K = off + 8;
|
||||
#else
|
||||
pass_K = off + 4;
|
||||
#endif
|
||||
#endif
|
||||
float B0 = B[bi + 0];
|
||||
float B1 = B[bi + 1];
|
||||
float B2 = B[bi + 2];
|
||||
float B3 = B[bi + 3];
|
||||
bi += 4;
|
||||
|
||||
vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
|
||||
ai += 8;
|
||||
|
||||
vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
|
||||
vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl);
|
||||
vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl);
|
||||
vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl);
|
||||
|
||||
for (BLASLONG k = 1; k < pass_K; k++) {
|
||||
B0 = B[bi + 0];
|
||||
B1 = B[bi + 1];
|
||||
B2 = B[bi + 2];
|
||||
B3 = B[bi + 3];
|
||||
bi += 4;
|
||||
|
||||
A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
|
||||
ai += 8;
|
||||
|
||||
result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
|
||||
result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl);
|
||||
result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl);
|
||||
result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl);
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
|
||||
vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl);
|
||||
vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl);
|
||||
vfloat32m2_t c2 = __riscv_vfmul_vf_f32m2(result2, alpha, gvl);
|
||||
vfloat32m2_t c3 = __riscv_vfmul_vf_f32m2(result3, alpha, gvl);
|
||||
__riscv_vse32_v_f32m2(&C[ci], c0, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c1, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c2, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c3, gvl);
|
||||
m_top += 8;
|
||||
}
|
||||
|
||||
if (M & 4) {
|
||||
gvl = __riscv_vsetvl_e32m2(4);
|
||||
|
||||
BLASLONG ai = m_top * K;
|
||||
BLASLONG bi = n_top * K;
|
||||
BLASLONG pass_K = K;
|
||||
#ifdef LEFT
|
||||
BLASLONG off = offset + m_top;
|
||||
#else
|
||||
BLASLONG off = -offset + n_top;
|
||||
#endif
|
||||
#ifdef BACKWARDS
|
||||
ai += off * 4;
|
||||
bi += off * 4;
|
||||
pass_K -= off;
|
||||
#else
|
||||
#ifdef LEFT
|
||||
pass_K = off + 4;
|
||||
#else
|
||||
pass_K = off + 4;
|
||||
#endif
|
||||
#endif
|
||||
float B0 = B[bi + 0];
|
||||
float B1 = B[bi + 1];
|
||||
float B2 = B[bi + 2];
|
||||
float B3 = B[bi + 3];
|
||||
bi += 4;
|
||||
|
||||
vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
|
||||
ai += 4;
|
||||
|
||||
vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
|
||||
vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl);
|
||||
vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl);
|
||||
vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl);
|
||||
|
||||
for (BLASLONG k = 1; k < pass_K; k++) {
|
||||
B0 = B[bi + 0];
|
||||
B1 = B[bi + 1];
|
||||
B2 = B[bi + 2];
|
||||
B3 = B[bi + 3];
|
||||
bi += 4;
|
||||
|
||||
A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
|
||||
ai += 4;
|
||||
|
||||
result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
|
||||
result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl);
|
||||
result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl);
|
||||
result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl);
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
|
||||
vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl);
|
||||
vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl);
|
||||
vfloat32m2_t c2 = __riscv_vfmul_vf_f32m2(result2, alpha, gvl);
|
||||
vfloat32m2_t c3 = __riscv_vfmul_vf_f32m2(result3, alpha, gvl);
|
||||
__riscv_vse32_v_f32m2(&C[ci], c0, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c1, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c2, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c3, gvl);
|
||||
m_top += 4;
|
||||
}
|
||||
|
||||
if (M & 2) {
|
||||
float result0 = 0;
|
||||
float result1 = 0;
|
||||
float result2 = 0;
|
||||
float result3 = 0;
|
||||
float result4 = 0;
|
||||
float result5 = 0;
|
||||
float result6 = 0;
|
||||
float result7 = 0;
|
||||
BLASLONG ai = m_top * K;
|
||||
BLASLONG bi = n_top * K;
|
||||
BLASLONG pass_K = K;
|
||||
#ifdef LEFT
|
||||
BLASLONG off = offset + m_top;
|
||||
#else
|
||||
BLASLONG off = -offset + n_top;
|
||||
#endif
|
||||
#ifdef BACKWARDS
|
||||
ai += off * 2;
|
||||
bi += off * 4;
|
||||
pass_K -= off;
|
||||
#else
|
||||
#ifdef LEFT
|
||||
pass_K = off + 2;
|
||||
#else
|
||||
pass_K = off + 4;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
for (BLASLONG k = 0; k < pass_K; k++) {
|
||||
result0 += A[ai + 0] * B[bi + 0];
|
||||
result1 += A[ai + 1] * B[bi + 0];
|
||||
result2 += A[ai + 0] * B[bi + 1];
|
||||
result3 += A[ai + 1] * B[bi + 1];
|
||||
result4 += A[ai + 0] * B[bi + 2];
|
||||
result5 += A[ai + 1] * B[bi + 2];
|
||||
result6 += A[ai + 0] * B[bi + 3];
|
||||
result7 += A[ai + 1] * B[bi + 3];
|
||||
ai += 2;
|
||||
bi += 4;
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
C[ci + 0 * ldc + 0] = alpha * result0;
|
||||
C[ci + 0 * ldc + 1] = alpha * result1;
|
||||
C[ci + 1 * ldc + 0] = alpha * result2;
|
||||
C[ci + 1 * ldc + 1] = alpha * result3;
|
||||
C[ci + 2 * ldc + 0] = alpha * result4;
|
||||
C[ci + 2 * ldc + 1] = alpha * result5;
|
||||
C[ci + 3 * ldc + 0] = alpha * result6;
|
||||
C[ci + 3 * ldc + 1] = alpha * result7;
|
||||
m_top += 2;
|
||||
}
|
||||
|
||||
if (M & 1) {
|
||||
float result0 = 0;
|
||||
float result1 = 0;
|
||||
float result2 = 0;
|
||||
float result3 = 0;
|
||||
BLASLONG ai = m_top * K;
|
||||
BLASLONG bi = n_top * K;
|
||||
BLASLONG pass_K = K;
|
||||
#ifdef LEFT
|
||||
BLASLONG off = offset + m_top;
|
||||
#else
|
||||
BLASLONG off = -offset + n_top;
|
||||
#endif
|
||||
#ifdef BACKWARDS
|
||||
ai += off * 1;
|
||||
bi += off * 4;
|
||||
pass_K -= off;
|
||||
#else
|
||||
#ifdef LEFT
|
||||
pass_K = off + 1;
|
||||
#else
|
||||
pass_K = off + 4;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
for (BLASLONG k = 0; k < pass_K; k++) {
|
||||
result0 += A[ai + 0] * B[bi + 0];
|
||||
result1 += A[ai + 0] * B[bi + 1];
|
||||
result2 += A[ai + 0] * B[bi + 2];
|
||||
result3 += A[ai + 0] * B[bi + 3];
|
||||
ai += 1;
|
||||
bi += 4;
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
C[ci + 0 * ldc + 0] = alpha * result0;
|
||||
C[ci + 1 * ldc + 0] = alpha * result1;
|
||||
C[ci + 2 * ldc + 0] = alpha * result2;
|
||||
C[ci + 3 * ldc + 0] = alpha * result3;
|
||||
m_top += 1;
|
||||
}
|
||||
|
||||
n_top += 4;
|
||||
}
|
||||
|
||||
// -- tails for N=2
|
||||
|
||||
if (N & 2) {
|
||||
gvl = __riscv_vsetvl_e32m2(8);
|
||||
m_top = 0;
|
||||
|
||||
for (BLASLONG i = 0; i < M / 8; i += 1) {
|
||||
BLASLONG ai = m_top * K;
|
||||
BLASLONG bi = n_top * K;
|
||||
BLASLONG pass_K = K;
|
||||
#ifdef LEFT
|
||||
BLASLONG off = offset + m_top;
|
||||
#else
|
||||
BLASLONG off = -offset + n_top;
|
||||
#endif
|
||||
#ifdef BACKWARDS
|
||||
ai += off * 8;
|
||||
bi += off * 2;
|
||||
pass_K -= off;
|
||||
#else
|
||||
#ifdef LEFT
|
||||
pass_K = off + 8;
|
||||
#else
|
||||
pass_K = off + 2;
|
||||
#endif
|
||||
#endif
|
||||
float B0 = B[bi + 0];
|
||||
float B1 = B[bi + 1];
|
||||
bi += 2;
|
||||
|
||||
vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
|
||||
ai += 8;
|
||||
|
||||
vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
|
||||
vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl);
|
||||
|
||||
for (BLASLONG k = 1; k < pass_K; k++) {
|
||||
B0 = B[bi + 0];
|
||||
B1 = B[bi + 1];
|
||||
bi += 2;
|
||||
|
||||
A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
|
||||
ai += 8;
|
||||
|
||||
result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
|
||||
result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl);
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
|
||||
vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl);
|
||||
vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl);
|
||||
__riscv_vse32_v_f32m2(&C[ci], c0, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c1, gvl);
|
||||
m_top += 8;
|
||||
}
|
||||
|
||||
if (M & 4) {
|
||||
gvl = __riscv_vsetvl_e32m2(4);
|
||||
|
||||
BLASLONG ai = m_top * K;
|
||||
BLASLONG bi = n_top * K;
|
||||
BLASLONG pass_K = K;
|
||||
#ifdef LEFT
|
||||
BLASLONG off = offset + m_top;
|
||||
#else
|
||||
BLASLONG off = -offset + n_top;
|
||||
#endif
|
||||
#ifdef BACKWARDS
|
||||
ai += off * 4;
|
||||
bi += off * 2;
|
||||
pass_K -= off;
|
||||
#else
|
||||
#ifdef LEFT
|
||||
pass_K = off + 4;
|
||||
#else
|
||||
pass_K = off + 2;
|
||||
#endif
|
||||
#endif
|
||||
float B0 = B[bi + 0];
|
||||
float B1 = B[bi + 1];
|
||||
bi += 2;
|
||||
|
||||
vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
|
||||
ai += 4;
|
||||
|
||||
vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
|
||||
vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl);
|
||||
|
||||
for (BLASLONG k = 1; k < pass_K; k++) {
|
||||
B0 = B[bi + 0];
|
||||
B1 = B[bi + 1];
|
||||
bi += 2;
|
||||
|
||||
A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
|
||||
ai += 4;
|
||||
|
||||
result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
|
||||
result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl);
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
|
||||
vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl);
|
||||
vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl);
|
||||
__riscv_vse32_v_f32m2(&C[ci], c0, gvl);
|
||||
ci += ldc - gvl * 0;
|
||||
__riscv_vse32_v_f32m2(&C[ci], c1, gvl);
|
||||
m_top += 4;
|
||||
}
|
||||
|
||||
if (M & 2) {
|
||||
float result0 = 0;
|
||||
float result1 = 0;
|
||||
float result2 = 0;
|
||||
float result3 = 0;
|
||||
BLASLONG ai = m_top * K;
|
||||
BLASLONG bi = n_top * K;
|
||||
BLASLONG pass_K = K;
|
||||
#ifdef LEFT
|
||||
BLASLONG off = offset + m_top;
|
||||
#else
|
||||
BLASLONG off = -offset + n_top;
|
||||
#endif
|
||||
#ifdef BACKWARDS
|
||||
ai += off * 2;
|
||||
bi += off * 2;
|
||||
pass_K -= off;
|
||||
#else
|
||||
#ifdef LEFT
|
||||
pass_K = off + 2;
|
||||
#else
|
||||
pass_K = off + 2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
for (BLASLONG k = 0; k < pass_K; k++) {
|
||||
result0 += A[ai + 0] * B[bi + 0];
|
||||
result1 += A[ai + 1] * B[bi + 0];
|
||||
result2 += A[ai + 0] * B[bi + 1];
|
||||
result3 += A[ai + 1] * B[bi + 1];
|
||||
ai += 2;
|
||||
bi += 2;
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
C[ci + 0 * ldc + 0] = alpha * result0;
|
||||
C[ci + 0 * ldc + 1] = alpha * result1;
|
||||
C[ci + 1 * ldc + 0] = alpha * result2;
|
||||
C[ci + 1 * ldc + 1] = alpha * result3;
|
||||
m_top += 2;
|
||||
}
|
||||
|
||||
if (M & 1) {
|
||||
float result0 = 0;
|
||||
float result1 = 0;
|
||||
BLASLONG ai = m_top * K;
|
||||
BLASLONG bi = n_top * K;
|
||||
BLASLONG pass_K = K;
|
||||
#ifdef LEFT
|
||||
BLASLONG off = offset + m_top;
|
||||
#else
|
||||
BLASLONG off = -offset + n_top;
|
||||
#endif
|
||||
#ifdef BACKWARDS
|
||||
ai += off * 1;
|
||||
bi += off * 2;
|
||||
pass_K -= off;
|
||||
#else
|
||||
#ifdef LEFT
|
||||
pass_K = off + 1;
|
||||
#else
|
||||
pass_K = off + 2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
for (BLASLONG k = 0; k < pass_K; k++) {
|
||||
result0 += A[ai + 0] * B[bi + 0];
|
||||
result1 += A[ai + 0] * B[bi + 1];
|
||||
ai += 1;
|
||||
bi += 2;
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
C[ci + 0 * ldc + 0] = alpha * result0;
|
||||
C[ci + 1 * ldc + 0] = alpha * result1;
|
||||
m_top += 1;
|
||||
}
|
||||
|
||||
n_top += 2;
|
||||
}
|
||||
|
||||
// -- tails for N=1
|
||||
|
||||
if (N & 1) {
|
||||
gvl = __riscv_vsetvl_e32m2(8);
|
||||
m_top = 0;
|
||||
|
||||
for (BLASLONG i = 0; i < M / 8; i += 1) {
|
||||
BLASLONG ai = m_top * K;
|
||||
BLASLONG bi = n_top * K;
|
||||
BLASLONG pass_K = K;
|
||||
#ifdef LEFT
|
||||
BLASLONG off = offset + m_top;
|
||||
#else
|
||||
BLASLONG off = -offset + n_top;
|
||||
#endif
|
||||
#ifdef BACKWARDS
|
||||
ai += off * 8;
|
||||
bi += off * 1;
|
||||
pass_K -= off;
|
||||
#else
|
||||
#ifdef LEFT
|
||||
pass_K = off + 8;
|
||||
#else
|
||||
pass_K = off + 1;
|
||||
#endif
|
||||
#endif
|
||||
float B0 = B[bi + 0];
|
||||
bi += 1;
|
||||
|
||||
vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
|
||||
ai += 8;
|
||||
|
||||
vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
|
||||
|
||||
for (BLASLONG k = 1; k < pass_K; k++) {
|
||||
B0 = B[bi + 0];
|
||||
bi += 1;
|
||||
|
||||
A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
|
||||
ai += 8;
|
||||
|
||||
result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
|
||||
vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl);
|
||||
__riscv_vse32_v_f32m2(&C[ci], c0, gvl);
|
||||
m_top += 8;
|
||||
}
|
||||
|
||||
if (M & 4) {
|
||||
gvl = __riscv_vsetvl_e32m2(4);
|
||||
|
||||
BLASLONG ai = m_top * K;
|
||||
BLASLONG bi = n_top * K;
|
||||
BLASLONG pass_K = K;
|
||||
#ifdef LEFT
|
||||
BLASLONG off = offset + m_top;
|
||||
#else
|
||||
BLASLONG off = -offset + n_top;
|
||||
#endif
|
||||
#ifdef BACKWARDS
|
||||
ai += off * 4;
|
||||
bi += off * 1;
|
||||
pass_K -= off;
|
||||
#else
|
||||
#ifdef LEFT
|
||||
pass_K = off + 4;
|
||||
#else
|
||||
pass_K = off + 1;
|
||||
#endif
|
||||
#endif
|
||||
float B0 = B[bi + 0];
|
||||
bi += 1;
|
||||
|
||||
vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
|
||||
ai += 4;
|
||||
|
||||
vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
|
||||
|
||||
for (BLASLONG k = 1; k < pass_K; k++) {
|
||||
B0 = B[bi + 0];
|
||||
bi += 1;
|
||||
|
||||
A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
|
||||
ai += 4;
|
||||
|
||||
result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
|
||||
vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl);
|
||||
__riscv_vse32_v_f32m2(&C[ci], c0, gvl);
|
||||
m_top += 4;
|
||||
}
|
||||
|
||||
if (M & 2) {
|
||||
float result0 = 0;
|
||||
float result1 = 0;
|
||||
BLASLONG ai = m_top * K;
|
||||
BLASLONG bi = n_top * K;
|
||||
BLASLONG pass_K = K;
|
||||
#ifdef LEFT
|
||||
BLASLONG off = offset + m_top;
|
||||
#else
|
||||
BLASLONG off = -offset + n_top;
|
||||
#endif
|
||||
#ifdef BACKWARDS
|
||||
ai += off * 2;
|
||||
bi += off * 1;
|
||||
pass_K -= off;
|
||||
#else
|
||||
#ifdef LEFT
|
||||
pass_K = off + 2;
|
||||
#else
|
||||
pass_K = off + 1;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
for (BLASLONG k = 0; k < pass_K; k++) {
|
||||
result0 += A[ai + 0] * B[bi + 0];
|
||||
result1 += A[ai + 1] * B[bi + 0];
|
||||
ai += 2;
|
||||
bi += 1;
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
C[ci + 0 * ldc + 0] = alpha * result0;
|
||||
C[ci + 0 * ldc + 1] = alpha * result1;
|
||||
m_top += 2;
|
||||
}
|
||||
|
||||
if (M & 1) {
|
||||
float result0 = 0;
|
||||
BLASLONG ai = m_top * K;
|
||||
BLASLONG bi = n_top * K;
|
||||
BLASLONG pass_K = K;
|
||||
#ifdef LEFT
|
||||
BLASLONG off = offset + m_top;
|
||||
#else
|
||||
BLASLONG off = -offset + n_top;
|
||||
#endif
|
||||
#ifdef BACKWARDS
|
||||
ai += off * 1;
|
||||
bi += off * 1;
|
||||
pass_K -= off;
|
||||
#else
|
||||
#ifdef LEFT
|
||||
pass_K = off + 1;
|
||||
#else
|
||||
pass_K = off + 1;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
for (BLASLONG k = 0; k < pass_K; k++) {
|
||||
result0 += A[ai + 0] * B[bi + 0];
|
||||
ai += 1;
|
||||
bi += 1;
|
||||
}
|
||||
|
||||
BLASLONG ci = n_top * ldc + m_top;
|
||||
C[ci + 0 * ldc + 0] = alpha * result0;
|
||||
m_top += 1;
|
||||
}
|
||||
|
||||
n_top += 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
95
kernel/riscv64/sum_rvv.c
Normal file
95
kernel/riscv64/sum_rvv.c
Normal file
@@ -0,0 +1,95 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) __riscv_vsetvl_e32m8(n)
|
||||
#define VSETVL_MAX __riscv_vsetvlmax_e32m8()
|
||||
#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1()
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VLEV_FLOAT __riscv_vle32_v_f32m8
|
||||
#define VLSEV_FLOAT __riscv_vlse32_v_f32m8
|
||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8
|
||||
#define VFADDVV_FLOAT_TU __riscv_vfadd_vv_f32m8_tu
|
||||
#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m8_f32m1
|
||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
|
||||
#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32
|
||||
#else
|
||||
#define VSETVL(n) __riscv_vsetvl_e64m8(n)
|
||||
#define VSETVL_MAX __riscv_vsetvlmax_e64m8()
|
||||
#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1()
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VLEV_FLOAT __riscv_vle64_v_f64m8
|
||||
#define VLSEV_FLOAT __riscv_vlse64_v_f64m8
|
||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8
|
||||
#define VFADDVV_FLOAT_TU __riscv_vfadd_vv_f64m8_tu
|
||||
#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m8_f64m1
|
||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
|
||||
#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64
|
||||
#endif
|
||||
|
||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
FLOAT sumf = 0.0;
|
||||
if (n <= 0 || inc_x <= 0) return(sumf);
|
||||
|
||||
FLOAT_V_T vx, vsum;
|
||||
FLOAT_V_T_M1 v_res;
|
||||
|
||||
v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1);
|
||||
size_t vlmax = VSETVL_MAX;
|
||||
vsum = VFMVVF_FLOAT(0.0, vlmax);
|
||||
|
||||
if(inc_x == 1) {
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
vx = VLEV_FLOAT(x, vl);
|
||||
vsum = VFADDVV_FLOAT_TU(vsum, vsum, vx, vl);
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
BLASLONG stride_x = inc_x * sizeof(FLOAT);
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl*inc_x) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
vx = VLSEV_FLOAT(x, stride_x, vl);
|
||||
vsum = VFADDVV_FLOAT_TU(vsum, vsum, vx, vl);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
v_res = VFREDSUMVS_FLOAT(vsum, v_res, vlmax);
|
||||
sumf = VFMVFS_FLOAT_M1(v_res);
|
||||
return(sumf);
|
||||
}
|
||||
114
kernel/riscv64/sum_vector.c
Normal file
114
kernel/riscv64/sum_vector.c
Normal file
@@ -0,0 +1,114 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2020, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n)
|
||||
#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)()
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8)
|
||||
#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8)
|
||||
#define VFREDSUMVS_FLOAT RISCV_RVV(vfredusum_vs_f32m8_f32m1)
|
||||
#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8)
|
||||
#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1)
|
||||
#define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f32m8)
|
||||
#else
|
||||
#define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n)
|
||||
#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)()
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8)
|
||||
#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8)
|
||||
#define VFREDSUMVS_FLOAT RISCV_RVV(vfredusum_vs_f64m8_f64m1)
|
||||
#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8)
|
||||
#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1)
|
||||
#define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f64m8)
|
||||
#endif
|
||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
BLASLONG i=0, j=0;
|
||||
BLASLONG ix=0;
|
||||
FLOAT asumf=0.0;
|
||||
if (n <= 0 || inc_x <= 0) return(asumf);
|
||||
unsigned int gvl = 0;
|
||||
FLOAT_V_T v0, v1, v_sum;
|
||||
FLOAT_V_T_M1 v_res;
|
||||
gvl = VSETVL_MAX;
|
||||
v_res = VFMVVF_FLOAT_M1(0, gvl);
|
||||
|
||||
if(inc_x == 1){
|
||||
gvl = VSETVL(n);
|
||||
if(gvl <= n/2){
|
||||
v_sum = VFMVVF_FLOAT(0, gvl);
|
||||
for(i=0,j=0; i<n/(gvl*2); i++){
|
||||
v0 = VLEV_FLOAT(&x[j], gvl);
|
||||
v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);
|
||||
|
||||
v1 = VLEV_FLOAT(&x[j+gvl], gvl);
|
||||
v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);
|
||||
j += gvl * 2;
|
||||
}
|
||||
v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl);
|
||||
}
|
||||
for(;j<n;){
|
||||
gvl = VSETVL(n-j);
|
||||
v0 = VLEV_FLOAT(&x[j], gvl);
|
||||
v_res = VFREDSUMVS_FLOAT(v0, v_res, gvl);
|
||||
j += gvl;
|
||||
}
|
||||
}else{
|
||||
gvl = VSETVL(n);
|
||||
unsigned int stride_x = inc_x * sizeof(FLOAT);
|
||||
if(gvl <= n/2){
|
||||
v_sum = VFMVVF_FLOAT(0, gvl);
|
||||
BLASLONG inc_xv = inc_x * gvl;
|
||||
for(i=0,j=0; i<n/(gvl*2); i++){
|
||||
v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
|
||||
v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);
|
||||
|
||||
v1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl);
|
||||
v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);
|
||||
j += gvl * 2;
|
||||
inc_xv += inc_xv * 2;
|
||||
}
|
||||
v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl);
|
||||
}
|
||||
for(;j<n;){
|
||||
gvl = VSETVL(n-j);
|
||||
v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
|
||||
v_res = VFREDSUMVS_FLOAT(v0, v_res, gvl);
|
||||
j += gvl;
|
||||
}
|
||||
}
|
||||
asumf = EXTRACT_FLOAT(v_res);
|
||||
return(asumf);
|
||||
}
|
||||
|
||||
|
||||
@@ -41,7 +41,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
|
||||
BLASLONG ix=0,iy=0;
|
||||
FLOAT temp;
|
||||
|
||||
if ( n < 0 ) return(0);
|
||||
if ( n <= 0 ) return(0);
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
|
||||
138
kernel/riscv64/swap_rvv.c
Normal file
138
kernel/riscv64/swap_rvv.c
Normal file
@@ -0,0 +1,138 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) __riscv_vsetvl_e32m8(n)
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define VLEV_FLOAT __riscv_vle32_v_f32m8
|
||||
#define VLSEV_FLOAT __riscv_vlse32_v_f32m8
|
||||
#define VSEV_FLOAT __riscv_vse32_v_f32m8
|
||||
#define VSSEV_FLOAT __riscv_vsse32_v_f32m8
|
||||
#else
|
||||
#define VSETVL(n) __riscv_vsetvl_e64m8(n)
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define VLEV_FLOAT __riscv_vle64_v_f64m8
|
||||
#define VLSEV_FLOAT __riscv_vlse64_v_f64m8
|
||||
#define VSEV_FLOAT __riscv_vse64_v_f64m8
|
||||
#define VSSEV_FLOAT __riscv_vsse64_v_f64m8
|
||||
#endif
|
||||
|
||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
|
||||
{
|
||||
BLASLONG stride_x, stride_y;
|
||||
FLOAT_V_T vx, vy;
|
||||
|
||||
if (n <= 0) return(0);
|
||||
|
||||
if (inc_x == 0 && inc_y == 0) {
|
||||
if (n & 1) {
|
||||
FLOAT temp = x[0];
|
||||
x[0] = y[0];
|
||||
y[0] = temp;
|
||||
}
|
||||
else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
else if(inc_x == 0) {
|
||||
FLOAT temp = x[0];
|
||||
x[0] = y[(n - 1) * inc_y];
|
||||
FLOAT* ptr = y + (n - 1) * inc_y; // start from the last one
|
||||
stride_y = (0 - inc_y) * sizeof(FLOAT); // reverse
|
||||
BLASLONG m = n - 1;
|
||||
for (size_t vl; m > 0; m -= vl, ptr -= vl*inc_y) {
|
||||
vl = VSETVL(m);
|
||||
vy = VLSEV_FLOAT(ptr - 1, stride_y, vl);
|
||||
VSSEV_FLOAT(ptr, stride_y, vy, vl);
|
||||
}
|
||||
y[0] = temp;
|
||||
}
|
||||
else if(inc_y == 0) {
|
||||
FLOAT temp = y[0];
|
||||
y[0] = x[(n - 1) * inc_x];
|
||||
FLOAT* ptr = x + (n - 1) * inc_x; // start from the last one
|
||||
stride_x = (0 - inc_x) * sizeof(FLOAT); // reverse
|
||||
BLASLONG m = n - 1;
|
||||
for (size_t vl; m > 0; m -= vl, ptr -= vl*inc_x) {
|
||||
vl = VSETVL(m);
|
||||
vx = VLSEV_FLOAT(ptr - 1, stride_x, vl);
|
||||
VSSEV_FLOAT(ptr, stride_x, vx, vl);
|
||||
}
|
||||
x[0] = temp;
|
||||
}
|
||||
else if(inc_x == 1 && inc_y == 1) {
|
||||
for (size_t vl; n > 0; n -= vl, x += vl, y += vl) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
vx = VLEV_FLOAT(x, vl);
|
||||
vy = VLEV_FLOAT(y, vl);
|
||||
VSEV_FLOAT(y, vx, vl);
|
||||
VSEV_FLOAT(x, vy, vl);
|
||||
}
|
||||
|
||||
} else if (inc_y == 1) {
|
||||
stride_x = inc_x * sizeof(FLOAT);
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
vx = VLSEV_FLOAT(x, stride_x, vl);
|
||||
vy = VLEV_FLOAT(y, vl);
|
||||
VSEV_FLOAT(y, vx, vl);
|
||||
VSSEV_FLOAT(x, stride_x, vy, vl);
|
||||
}
|
||||
|
||||
} else if(inc_x == 1) {
|
||||
stride_y = inc_y * sizeof(FLOAT);
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
vx = VLEV_FLOAT(x, vl);
|
||||
vy = VLSEV_FLOAT(y, stride_y, vl);
|
||||
VSSEV_FLOAT(y, stride_y, vx, vl);
|
||||
VSEV_FLOAT(x, vy, vl);
|
||||
}
|
||||
|
||||
} else {
|
||||
stride_x = inc_x * sizeof(FLOAT);
|
||||
stride_y = inc_y * sizeof(FLOAT);
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
vx = VLSEV_FLOAT(x, stride_x, vl);
|
||||
vy = VLSEV_FLOAT(y, stride_y, vl);
|
||||
VSSEV_FLOAT(y, stride_y, vx, vl);
|
||||
VSSEV_FLOAT(x, stride_x, vy, vl);
|
||||
}
|
||||
}
|
||||
|
||||
return(0);
|
||||
}
|
||||
@@ -27,35 +27,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#include "common.h"
|
||||
#include <stdio.h>
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m1()
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define VLEV_FLOAT vle32_v_f32m8
|
||||
#define VLSEV_FLOAT vlse32_v_f32m8
|
||||
#define VSEV_FLOAT vse32_v_f32m8
|
||||
#define VSSEV_FLOAT vsse32_v_f32m8
|
||||
|
||||
#ifdef RISCV64_ZVL256B
|
||||
# define LMUL m2
|
||||
# if defined(DOUBLE)
|
||||
# define ELEN 64
|
||||
# define MLEN 32
|
||||
# else
|
||||
# define ELEN 32
|
||||
# define MLEN 16
|
||||
# endif
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e64m1()
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define VLEV_FLOAT vle64_v_f64m8
|
||||
#define VLSEV_FLOAT vlse64_v_f64m8
|
||||
#define VSEV_FLOAT vse64_v_f64m8
|
||||
#define VSSEV_FLOAT vsse64_v_f64m8
|
||||
# define LMUL m8
|
||||
# if defined(DOUBLE)
|
||||
# define ELEN 64
|
||||
# define MLEN 8
|
||||
# else
|
||||
# define ELEN 32
|
||||
# define MLEN 4
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#define _
|
||||
#define JOIN2_X(x, y) x ## y
|
||||
#define JOIN2(x, y) JOIN2_X(x, y)
|
||||
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
|
||||
|
||||
#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _)
|
||||
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
|
||||
#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL)
|
||||
#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL)
|
||||
#define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL)
|
||||
#define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL)
|
||||
|
||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
|
||||
{
|
||||
BLASLONG i = 0, j = 0;
|
||||
BLASLONG ix = 0,iy = 0;
|
||||
BLASLONG stride_x, stride_y;
|
||||
FLOAT_V_T vx0, vx1, vy0, vy1;
|
||||
unsigned int gvl = 0;
|
||||
|
||||
if (n < 0) return(0);
|
||||
if (n <= 0) return(0);
|
||||
|
||||
unsigned int gvl = VSETVL((inc_x != 0 && inc_y != 0) ? n : 1);
|
||||
if( inc_x == 0 && inc_y == 0 ) { n = n & 1; }
|
||||
|
||||
if(inc_x == 1 && inc_y == 1){
|
||||
gvl = VSETVL(n);
|
||||
if(gvl <= n/2){
|
||||
for(i=0,j=0; i<n/(2*gvl); i++){
|
||||
vx0 = VLEV_FLOAT(&x[j], gvl);
|
||||
@@ -79,7 +96,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
|
||||
j+=gvl;
|
||||
}
|
||||
}else if (inc_y == 1){
|
||||
gvl = VSETVL(n);
|
||||
stride_x = inc_x * sizeof(FLOAT);
|
||||
if(gvl <= n/2){
|
||||
BLASLONG inc_xv = inc_x * gvl;
|
||||
@@ -107,7 +123,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
|
||||
ix += inc_x * gvl;
|
||||
}
|
||||
}else if(inc_x == 1){
|
||||
gvl = VSETVL(n);
|
||||
stride_y = inc_y * sizeof(FLOAT);
|
||||
if(gvl <= n/2){
|
||||
BLASLONG inc_yv = inc_y * gvl;
|
||||
@@ -135,8 +150,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
|
||||
iy += inc_y * gvl;
|
||||
}
|
||||
}else{
|
||||
gvl = VSETVL(n);
|
||||
if (inc_x == 0 && inc_y == 0) gvl = VSETVL(1);
|
||||
stride_x = inc_x * sizeof(FLOAT);
|
||||
stride_y = inc_y * sizeof(FLOAT);
|
||||
if(gvl <= n/2){
|
||||
|
||||
101
kernel/riscv64/symm_lcopy_rvv_v1.c
Normal file
101
kernel/riscv64/symm_lcopy_rvv_v1.c
Normal file
@@ -0,0 +1,101 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) __riscv_vsetvl_e32m2(n)
|
||||
#define VSETVL_MAX __riscv_vsetvlmax_e32m2()
|
||||
#define FLOAT_V_T vfloat32m2_t
|
||||
#define VLEV_FLOAT __riscv_vle32_v_f32m2
|
||||
#define VSEV_FLOAT __riscv_vse32_v_f32m2
|
||||
#define VLSEV_FLOAT __riscv_vlse32_v_f32m2
|
||||
#define INT_V_T vint32m2_t
|
||||
#define VID_V_INT __riscv_vid_v_i32m2
|
||||
#define VADD_VX_INT __riscv_vadd_vx_i32m2
|
||||
#define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16
|
||||
#define VBOOL_T vbool16_t
|
||||
#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2
|
||||
#else
|
||||
#define VSETVL(n) __riscv_vsetvl_e64m2(n)
|
||||
#define VSETVL_MAX __riscv_vsetvlmax_e64m2()
|
||||
#define FLOAT_V_T vfloat64m2_t
|
||||
#define VLEV_FLOAT __riscv_vle64_v_f64m2
|
||||
#define VSEV_FLOAT __riscv_vse64_v_f64m2
|
||||
#define VLSEV_FLOAT __riscv_vlse64_v_f64m2
|
||||
#define INT_V_T vint64m2_t
|
||||
#define VID_V_INT __riscv_vid_v_i64m2
|
||||
#define VADD_VX_INT __riscv_vadd_vx_i64m2
|
||||
#define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32
|
||||
#define VBOOL_T vbool32_t
|
||||
#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2
|
||||
#endif
|
||||
|
||||
// Optimizes the implementation in ../generic/symm_lcopy_4.c
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b)
|
||||
{
|
||||
BLASLONG i, js, offset;
|
||||
|
||||
FLOAT *ao1, *ao2;
|
||||
|
||||
BLASLONG stride_lda = sizeof(FLOAT)*lda;
|
||||
|
||||
FLOAT_V_T vb, va1, va2;
|
||||
VBOOL_T vbool;
|
||||
INT_V_T vindex_max, vindex;
|
||||
|
||||
size_t vl = VSETVL_MAX;
|
||||
vindex_max = VID_V_INT(vl);
|
||||
|
||||
for (js = n; js > 0; js -= vl, posX += vl) {
|
||||
vl = VSETVL(js);
|
||||
offset = posX - posY;
|
||||
|
||||
ao1 = a + posX + posY * lda;
|
||||
ao2 = a + posY + (posX) * lda;
|
||||
|
||||
for (i = m; i > 0; i--, offset--) {
|
||||
va2 = VLSEV_FLOAT(ao2, stride_lda, vl);
|
||||
va1 = VLEV_FLOAT(ao1, vl);
|
||||
|
||||
// offset > (0 - vindex) ---> (offset + vindex) > 0
|
||||
vindex = VADD_VX_INT(vindex_max, offset, vl);
|
||||
vbool = VMSGT_VX_INT(vindex, 0, vl);
|
||||
|
||||
vb = VMERGE_VVM_FLOAT(va2, va1, vbool, vl);
|
||||
VSEV_FLOAT(b, vb, vl);
|
||||
|
||||
b += vl;
|
||||
ao1 += lda;
|
||||
ao2++;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
100
kernel/riscv64/symm_ucopy_rvv_v1.c
Normal file
100
kernel/riscv64/symm_ucopy_rvv_v1.c
Normal file
@@ -0,0 +1,100 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) __riscv_vsetvl_e32m2(n)
|
||||
#define VSETVL_MAX __riscv_vsetvlmax_e32m2()
|
||||
#define FLOAT_V_T vfloat32m2_t
|
||||
#define VLEV_FLOAT __riscv_vle32_v_f32m2
|
||||
#define VSEV_FLOAT __riscv_vse32_v_f32m2
|
||||
#define VLSEV_FLOAT __riscv_vlse32_v_f32m2
|
||||
#define INT_V_T vint32m2_t
|
||||
#define VID_V_INT __riscv_vid_v_i32m2
|
||||
#define VADD_VX_INT __riscv_vadd_vx_i32m2
|
||||
#define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16
|
||||
#define VBOOL_T vbool16_t
|
||||
#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2
|
||||
#else
|
||||
#define VSETVL(n) __riscv_vsetvl_e64m2(n)
|
||||
#define VSETVL_MAX __riscv_vsetvlmax_e64m2()
|
||||
#define FLOAT_V_T vfloat64m2_t
|
||||
#define VLEV_FLOAT __riscv_vle64_v_f64m2
|
||||
#define VSEV_FLOAT __riscv_vse64_v_f64m2
|
||||
#define VLSEV_FLOAT __riscv_vlse64_v_f64m2
|
||||
#define INT_V_T vint64m2_t
|
||||
#define VID_V_INT __riscv_vid_v_i64m2
|
||||
#define VADD_VX_INT __riscv_vadd_vx_i64m2
|
||||
#define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32
|
||||
#define VBOOL_T vbool32_t
|
||||
#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2
|
||||
#endif
|
||||
|
||||
// Optimizes the implementation in ../generic/symm_ucopy_4.c
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b)
|
||||
{
|
||||
BLASLONG i, js, offset;
|
||||
|
||||
FLOAT *ao1, *ao2;
|
||||
|
||||
BLASLONG stride_lda = sizeof(FLOAT)*lda;
|
||||
|
||||
FLOAT_V_T vb, va1, va2;
|
||||
VBOOL_T vbool;
|
||||
INT_V_T vindex_max, vindex;
|
||||
|
||||
size_t vl = VSETVL_MAX;
|
||||
vindex_max = VID_V_INT(vl);
|
||||
|
||||
for (js = n; js > 0; js -= vl, posX += vl) {
|
||||
vl = VSETVL(js);
|
||||
offset = posX - posY;
|
||||
|
||||
ao1 = a + posY + (posX + 0) * lda;
|
||||
ao2 = a + posX + 0 + posY * lda;
|
||||
|
||||
for (i = m; i > 0; i--, offset--) {
|
||||
va1 = VLSEV_FLOAT(ao1, stride_lda, vl);
|
||||
va2 = VLEV_FLOAT(ao2, vl);
|
||||
|
||||
// offset > (0 - vindex) ---> (offset + vindex) > 0
|
||||
vindex = VADD_VX_INT(vindex_max, offset, vl);
|
||||
vbool = VMSGT_VX_INT(vindex, 0, vl);
|
||||
|
||||
vb = VMERGE_VVM_FLOAT(va2, va1, vbool, vl);
|
||||
VSEV_FLOAT(b, vb, vl);
|
||||
|
||||
b += vl;
|
||||
ao1++;
|
||||
ao2 += lda;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
219
kernel/riscv64/symv_L_rvv.c
Normal file
219
kernel/riscv64/symv_L_rvv.c
Normal file
@@ -0,0 +1,219 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1()
|
||||
#define VSETVL(n) __riscv_vsetvl_e32m8(n)
|
||||
#define VSETVL_MAX __riscv_vsetvlmax_e32m8()
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define VLEV_FLOAT __riscv_vle32_v_f32m8
|
||||
#define VSEV_FLOAT __riscv_vse32_v_f32m8
|
||||
#define VLSEV_FLOAT __riscv_vlse32_v_f32m8
|
||||
#define VSSEV_FLOAT __riscv_vsse32_v_f32m8
|
||||
#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m8_tu
|
||||
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8
|
||||
#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m8
|
||||
#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8
|
||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8
|
||||
#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f32m8
|
||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
|
||||
#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m8_f32m1
|
||||
#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32
|
||||
#else
|
||||
#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1()
|
||||
#define VSETVL(n) __riscv_vsetvl_e64m8(n)
|
||||
#define VSETVL_MAX __riscv_vsetvlmax_e64m8()
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define VLEV_FLOAT __riscv_vle64_v_f64m8
|
||||
#define VSEV_FLOAT __riscv_vse64_v_f64m8
|
||||
#define VLSEV_FLOAT __riscv_vlse64_v_f64m8
|
||||
#define VSSEV_FLOAT __riscv_vsse64_v_f64m8
|
||||
#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m8_tu
|
||||
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8
|
||||
#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m8
|
||||
#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8
|
||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8
|
||||
#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f64m8
|
||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
|
||||
#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m8_f64m1
|
||||
#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64
|
||||
#endif
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
|
||||
{
|
||||
BLASLONG i, j, k;
|
||||
BLASLONG ix,iy;
|
||||
BLASLONG jx,jy;
|
||||
FLOAT temp1;
|
||||
FLOAT *a_ptr = a;
|
||||
|
||||
FLOAT_V_T_M1 v_res, v_z0;
|
||||
size_t vlmax = VSETVL_MAX_M1, vl;
|
||||
v_z0 = VFMVVF_FLOAT_M1(0, vlmax);
|
||||
vlmax = VSETVL_MAX;
|
||||
|
||||
FLOAT_V_T va, vx, vy, vr;
|
||||
BLASLONG stride_x, stride_y, inc_xv, inc_yv;
|
||||
|
||||
if(inc_x == 1 && inc_y == 1)
|
||||
{
|
||||
for (j=0; j<offset; j++)
|
||||
{
|
||||
temp1 = alpha * x[j];
|
||||
y[j] += temp1 * a_ptr[j];
|
||||
i = j + 1;
|
||||
vr = VFMVVF_FLOAT(0, vlmax);
|
||||
for (k = (m-i); k > 0; k -= vl, i += vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
va = VLEV_FLOAT(&a_ptr[i], vl);
|
||||
vy = VLEV_FLOAT(&y[i], vl);
|
||||
vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
|
||||
VSEV_FLOAT(&y[i], vy, vl);
|
||||
|
||||
vx = VLEV_FLOAT(&x[i], vl);
|
||||
vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl);
|
||||
|
||||
}
|
||||
v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax);
|
||||
|
||||
y[j] += alpha * VFMVFS_FLOAT_M1(v_res);
|
||||
a_ptr += lda;
|
||||
}
|
||||
}
|
||||
else if(inc_x == 1)
|
||||
{
|
||||
jy = 0;
|
||||
stride_y = inc_y * sizeof(FLOAT);
|
||||
for (j=0; j<offset; j++)
|
||||
{
|
||||
temp1 = alpha * x[j];
|
||||
y[jy] += temp1 * a_ptr[j];
|
||||
iy = jy + inc_y;
|
||||
i = j + 1;
|
||||
vr = VFMVVF_FLOAT(0, vlmax);
|
||||
for (k = (m-i); k > 0; k -= vl, i += vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
inc_yv = inc_y * vl;
|
||||
va = VLEV_FLOAT(&a_ptr[i], vl);
|
||||
vy = VLSEV_FLOAT(&y[iy], stride_y, vl);
|
||||
vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
|
||||
VSSEV_FLOAT(&y[iy], stride_y, vy, vl);
|
||||
|
||||
vx = VLEV_FLOAT(&x[i], vl);
|
||||
vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl);
|
||||
|
||||
iy += inc_yv;
|
||||
}
|
||||
v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax);
|
||||
|
||||
y[jy] += alpha * VFMVFS_FLOAT_M1(v_res);
|
||||
jy += inc_y;
|
||||
a_ptr += lda;
|
||||
}
|
||||
}
|
||||
else if(inc_y == 1)
|
||||
{
|
||||
jx = 0;
|
||||
stride_x = inc_x * sizeof(FLOAT);
|
||||
for (j=0; j<offset; j++)
|
||||
{
|
||||
temp1 = alpha * x[jx];
|
||||
y[j] += temp1 * a_ptr[j];
|
||||
ix = jx + inc_x;
|
||||
i = j + 1;
|
||||
vr = VFMVVF_FLOAT(0, vlmax);
|
||||
for (k = (m-i); k > 0; k -= vl, i += vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
inc_xv = inc_x * vl;
|
||||
|
||||
va = VLEV_FLOAT(&a_ptr[i], vl);
|
||||
vy = VLEV_FLOAT(&y[i], vl);
|
||||
vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
|
||||
VSEV_FLOAT(&y[i], vy, vl);
|
||||
|
||||
vx = VLSEV_FLOAT(&x[ix], stride_x, vl);
|
||||
vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl);
|
||||
|
||||
ix += inc_xv;
|
||||
}
|
||||
|
||||
v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax);
|
||||
|
||||
y[j] += alpha * VFMVFS_FLOAT_M1(v_res);
|
||||
jx += inc_x;
|
||||
a_ptr += lda;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
stride_x = inc_x * sizeof(FLOAT);
|
||||
stride_y = inc_y * sizeof(FLOAT);
|
||||
jx = 0;
|
||||
jy = 0;
|
||||
for (j=0; j<offset; j++)
|
||||
{
|
||||
temp1 = alpha * x[jx];
|
||||
y[jy] += temp1 * a_ptr[j];
|
||||
ix = jx + inc_x;
|
||||
iy = jy + inc_y;
|
||||
i = j + 1;
|
||||
vr = VFMVVF_FLOAT(0, vlmax);
|
||||
for (k = (m-i); k > 0; k -= vl, i += vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
inc_xv = inc_x * vl;
|
||||
inc_yv = inc_y * vl;
|
||||
|
||||
va = VLEV_FLOAT(&a_ptr[i], vl);
|
||||
vy = VLSEV_FLOAT(&y[iy], stride_y, vl);
|
||||
vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
|
||||
VSSEV_FLOAT(&y[iy], stride_y, vy, vl);
|
||||
|
||||
vx = VLSEV_FLOAT(&x[ix], stride_x, vl);
|
||||
vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl);
|
||||
|
||||
ix += inc_xv;
|
||||
iy += inc_yv;
|
||||
}
|
||||
v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax);
|
||||
|
||||
y[jy] += alpha * VFMVFS_FLOAT_M1(v_res);
|
||||
jx += inc_x;
|
||||
jy += inc_y;
|
||||
a_ptr += lda;
|
||||
}
|
||||
}
|
||||
return(0);
|
||||
}
|
||||
|
||||
@@ -27,37 +27,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#include "common.h"
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m4(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m1()
|
||||
#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n)
|
||||
#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)()
|
||||
#define FLOAT_V_T vfloat32m4_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
|
||||
#define VLEV_FLOAT vle32_v_f32m4
|
||||
#define VLSEV_FLOAT vlse32_v_f32m4
|
||||
#define VSEV_FLOAT vse32_v_f32m4
|
||||
#define VSSEV_FLOAT vsse32_v_f32m4
|
||||
#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
|
||||
#define VFMACCVV_FLOAT vfmacc_vv_f32m4
|
||||
#define VFMACCVF_FLOAT vfmacc_vf_f32m4
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f32m4
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
|
||||
#define VFMULVV_FLOAT vfmul_vv_f32m4
|
||||
#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4)
|
||||
#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4)
|
||||
#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4)
|
||||
#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4)
|
||||
#ifdef RISCV_0p10_INTRINSICS
|
||||
#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f32m4_f32m1(v_res, va, vb, gvl)
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m4(n)
|
||||
#define VSETVL_MAX vsetvlmax_e64m1()
|
||||
#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1)
|
||||
#endif
|
||||
#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4)
|
||||
#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4)
|
||||
#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4)
|
||||
#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1)
|
||||
#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m4)
|
||||
#else
|
||||
#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
|
||||
#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)()
|
||||
#define FLOAT_V_T vfloat64m4_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
|
||||
#define VLEV_FLOAT vle64_v_f64m4
|
||||
#define VLSEV_FLOAT vlse64_v_f64m4
|
||||
#define VSEV_FLOAT vse64_v_f64m4
|
||||
#define VSSEV_FLOAT vsse64_v_f64m4
|
||||
#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
|
||||
#define VFMACCVV_FLOAT vfmacc_vv_f64m4
|
||||
#define VFMACCVF_FLOAT vfmacc_vf_f64m4
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f64m4
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
|
||||
#define VFMULVV_FLOAT vfmul_vv_f64m4
|
||||
#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4)
|
||||
#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4)
|
||||
#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4)
|
||||
#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4)
|
||||
#ifdef RISCV_0p10_INTRINSICS
|
||||
#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f64m4_f64m1(v_res, va, vb, gvl)
|
||||
#else
|
||||
#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1)
|
||||
#endif
|
||||
#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4)
|
||||
#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4)
|
||||
#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4)
|
||||
#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1)
|
||||
#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m4)
|
||||
#endif
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
|
||||
@@ -99,8 +105,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
|
||||
|
||||
i += gvl;
|
||||
}
|
||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
|
||||
temp2 = VFMVFS_FLOAT(v_res);
|
||||
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
|
||||
temp2 = EXTRACT_FLOAT(v_res);
|
||||
if(i < m){
|
||||
gvl = VSETVL(m-i);
|
||||
vy = VLEV_FLOAT(&y[i], gvl);
|
||||
@@ -110,8 +116,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
|
||||
|
||||
vx = VLEV_FLOAT(&x[i], gvl);
|
||||
vr = VFMULVV_FLOAT(vx, va, gvl);
|
||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
|
||||
temp2 += VFMVFS_FLOAT(v_res);
|
||||
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
|
||||
temp2 += EXTRACT_FLOAT(v_res);
|
||||
}
|
||||
}
|
||||
y[j] += alpha * temp2;
|
||||
@@ -144,8 +150,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
|
||||
i += gvl;
|
||||
iy += inc_yv;
|
||||
}
|
||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
|
||||
temp2 = VFMVFS_FLOAT(v_res);
|
||||
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
|
||||
temp2 = EXTRACT_FLOAT(v_res);
|
||||
if(i < m){
|
||||
gvl = VSETVL(m-i);
|
||||
vy = VLSEV_FLOAT(&y[iy], stride_y, gvl);
|
||||
@@ -155,8 +161,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
|
||||
|
||||
vx = VLEV_FLOAT(&x[i], gvl);
|
||||
vr = VFMULVV_FLOAT(vx, va, gvl);
|
||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
|
||||
temp2 += VFMVFS_FLOAT(v_res);
|
||||
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
|
||||
temp2 += EXTRACT_FLOAT(v_res);
|
||||
}
|
||||
}
|
||||
y[jy] += alpha * temp2;
|
||||
@@ -190,8 +196,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
|
||||
i += gvl;
|
||||
ix += inc_xv;
|
||||
}
|
||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
|
||||
temp2 = VFMVFS_FLOAT(v_res);
|
||||
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
|
||||
temp2 = EXTRACT_FLOAT(v_res);
|
||||
if(i < m){
|
||||
gvl = VSETVL(m-i);
|
||||
vy = VLEV_FLOAT(&y[i], gvl);
|
||||
@@ -201,8 +207,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
|
||||
|
||||
vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
|
||||
vr = VFMULVV_FLOAT(vx, va, gvl);
|
||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
|
||||
temp2 += VFMVFS_FLOAT(v_res);
|
||||
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
|
||||
temp2 += EXTRACT_FLOAT(v_res);
|
||||
}
|
||||
}
|
||||
y[j] += alpha * temp2;
|
||||
@@ -241,8 +247,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
|
||||
ix += inc_xv;
|
||||
iy += inc_yv;
|
||||
}
|
||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
|
||||
temp2 = VFMVFS_FLOAT(v_res);
|
||||
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
|
||||
temp2 = EXTRACT_FLOAT(v_res);
|
||||
if(i < m){
|
||||
gvl = VSETVL(m-i);
|
||||
vy = VLSEV_FLOAT(&y[iy], stride_y, gvl);
|
||||
@@ -252,8 +258,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
|
||||
|
||||
vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
|
||||
vr = VFMULVV_FLOAT(vx, va, gvl);
|
||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
|
||||
temp2 += VFMVFS_FLOAT(v_res);
|
||||
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
|
||||
temp2 += EXTRACT_FLOAT(v_res);
|
||||
}
|
||||
}
|
||||
y[jy] += alpha * temp2;
|
||||
|
||||
216
kernel/riscv64/symv_U_rvv.c
Normal file
216
kernel/riscv64/symv_U_rvv.c
Normal file
@@ -0,0 +1,216 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1()
|
||||
#define VSETVL(n) __riscv_vsetvl_e32m8(n)
|
||||
#define VSETVL_MAX __riscv_vsetvlmax_e32m8()
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define VLEV_FLOAT __riscv_vle32_v_f32m8
|
||||
#define VSEV_FLOAT __riscv_vse32_v_f32m8
|
||||
#define VLSEV_FLOAT __riscv_vlse32_v_f32m8
|
||||
#define VSSEV_FLOAT __riscv_vsse32_v_f32m8
|
||||
#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m8_tu
|
||||
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8
|
||||
#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m8
|
||||
#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8
|
||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8
|
||||
#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f32m8
|
||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
|
||||
#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m8_f32m1
|
||||
#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32
|
||||
#else
|
||||
#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1()
|
||||
#define VSETVL(n) __riscv_vsetvl_e64m8(n)
|
||||
#define VSETVL_MAX __riscv_vsetvlmax_e64m8()
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define VLEV_FLOAT __riscv_vle64_v_f64m8
|
||||
#define VSEV_FLOAT __riscv_vse64_v_f64m8
|
||||
#define VLSEV_FLOAT __riscv_vlse64_v_f64m8
|
||||
#define VSSEV_FLOAT __riscv_vsse64_v_f64m8
|
||||
#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m8_tu
|
||||
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8
|
||||
#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m8
|
||||
#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8
|
||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8
|
||||
#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f64m8
|
||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
|
||||
#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m8_f64m1
|
||||
#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64
|
||||
#endif
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
|
||||
{
|
||||
BLASLONG i, j, k;
|
||||
BLASLONG ix,iy;
|
||||
BLASLONG jx,jy;
|
||||
FLOAT temp1;
|
||||
FLOAT *a_ptr = a;
|
||||
FLOAT_V_T_M1 v_res, v_z0;
|
||||
size_t vl_max = VSETVL_MAX_M1, vl;
|
||||
v_z0 = VFMVVF_FLOAT_M1(0, vl_max);
|
||||
vl_max = VSETVL_MAX;
|
||||
|
||||
FLOAT_V_T va, vx, vy, vr;
|
||||
BLASLONG stride_x, stride_y, inc_xv, inc_yv;
|
||||
|
||||
BLASLONG m1 = m - offset;
|
||||
if(inc_x == 1 && inc_y == 1)
|
||||
{
|
||||
a_ptr += m1 * lda;
|
||||
for (j=m1; j<m; j++)
|
||||
{
|
||||
temp1 = alpha * x[j];
|
||||
i = 0;
|
||||
vr = VFMVVF_FLOAT(0, vl_max);
|
||||
for (k = j; k > 0; k -= vl, i += vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
vy = VLEV_FLOAT(&y[i], vl);
|
||||
va = VLEV_FLOAT(&a_ptr[i], vl);
|
||||
vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
|
||||
VSEV_FLOAT(&y[i], vy, vl);
|
||||
|
||||
vx = VLEV_FLOAT(&x[i], vl);
|
||||
vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl);
|
||||
}
|
||||
v_res = VFREDSUM_FLOAT(vr, v_z0, vl_max);
|
||||
|
||||
y[j] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res);
|
||||
a_ptr += lda;
|
||||
}
|
||||
}
|
||||
else if(inc_x == 1)
|
||||
{
|
||||
jy = m1 * inc_y;
|
||||
a_ptr += m1 * lda;
|
||||
stride_y = inc_y * sizeof(FLOAT);
|
||||
for (j=m1; j<m; j++)
|
||||
{
|
||||
temp1 = alpha * x[j];
|
||||
iy = 0;
|
||||
i = 0;
|
||||
vr = VFMVVF_FLOAT(0, vl_max);
|
||||
for (k = j; k > 0; k -= vl, i += vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
inc_yv = inc_y * vl;
|
||||
vy = VLSEV_FLOAT(&y[iy], stride_y, vl);
|
||||
va = VLEV_FLOAT(&a_ptr[i], vl);
|
||||
vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
|
||||
VSSEV_FLOAT(&y[iy], stride_y, vy, vl);
|
||||
|
||||
vx = VLEV_FLOAT(&x[i], vl);
|
||||
vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl);
|
||||
|
||||
iy += inc_yv;
|
||||
}
|
||||
v_res = VFREDSUM_FLOAT(vr, v_z0, vl_max);
|
||||
|
||||
y[jy] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res);
|
||||
a_ptr += lda;
|
||||
jy += inc_y;
|
||||
}
|
||||
}
|
||||
else if(inc_y == 1)
|
||||
{
|
||||
jx = m1 * inc_x;
|
||||
a_ptr += m1 * lda;
|
||||
stride_x = inc_x * sizeof(FLOAT);
|
||||
for (j=m1; j<m; j++)
|
||||
{
|
||||
temp1 = alpha * x[jx];
|
||||
ix = 0;
|
||||
i = 0;
|
||||
vr = VFMVVF_FLOAT(0, vl_max);
|
||||
for (k = j; k > 0; k -= vl, i += vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
inc_xv = inc_x * vl;
|
||||
|
||||
vy = VLEV_FLOAT(&y[i], vl);
|
||||
va = VLEV_FLOAT(&a_ptr[i], vl);
|
||||
vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
|
||||
VSEV_FLOAT(&y[i], vy, vl);
|
||||
|
||||
vx = VLSEV_FLOAT(&x[ix], stride_x, vl);
|
||||
vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl);
|
||||
|
||||
ix += inc_xv;
|
||||
}
|
||||
v_res = VFREDSUM_FLOAT(vr, v_z0, vl_max);
|
||||
|
||||
y[j] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res);
|
||||
a_ptr += lda;
|
||||
jx += inc_x;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
jx = m1 * inc_x;
|
||||
jy = m1 * inc_y;
|
||||
a_ptr += m1 * lda;
|
||||
stride_x = inc_x * sizeof(FLOAT);
|
||||
stride_y = inc_y * sizeof(FLOAT);
|
||||
for (j=m1; j<m; j++)
|
||||
{
|
||||
temp1 = alpha * x[jx];
|
||||
|
||||
ix = 0;
|
||||
iy = 0;
|
||||
i = 0;
|
||||
vr = VFMVVF_FLOAT(0, vl_max);
|
||||
for (k = j; k > 0; k -= vl, i += vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
inc_xv = inc_x * vl;
|
||||
inc_yv = inc_y * vl;
|
||||
vy = VLSEV_FLOAT(&y[iy], stride_y, vl);
|
||||
va = VLEV_FLOAT(&a_ptr[i], vl);
|
||||
vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
|
||||
VSSEV_FLOAT(&y[iy], stride_y, vy, vl);
|
||||
|
||||
vx = VLSEV_FLOAT(&x[ix], stride_x, vl);
|
||||
vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl);
|
||||
ix += inc_xv;
|
||||
iy += inc_yv;
|
||||
}
|
||||
v_res = VFREDSUM_FLOAT(vr, v_z0, vl_max);
|
||||
|
||||
y[jy] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res);
|
||||
a_ptr += lda;
|
||||
jx += inc_x;
|
||||
jy += inc_y;
|
||||
}
|
||||
}
|
||||
return(0);
|
||||
}
|
||||
@@ -27,39 +27,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#include "common.h"
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m4(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m1()
|
||||
#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n)
|
||||
#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)()
|
||||
#define FLOAT_V_T vfloat32m4_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
|
||||
#define VLEV_FLOAT vle32_v_f32m4
|
||||
#define VLSEV_FLOAT vlse32_v_f32m4
|
||||
#define VSEV_FLOAT vse32_v_f32m4
|
||||
#define VSSEV_FLOAT vsse32_v_f32m4
|
||||
#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
|
||||
#define VFMACCVV_FLOAT vfmacc_vv_f32m4
|
||||
#define VFMACCVF_FLOAT vfmacc_vf_f32m4
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f32m4
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
|
||||
#define VFDOTVV_FLOAT vfdot_vv_f32m4
|
||||
#define VFMULVV_FLOAT vfmul_vv_f32m4
|
||||
#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4)
|
||||
#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4)
|
||||
#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4)
|
||||
#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4)
|
||||
#ifdef RISCV_0p10_INTRINSICS
|
||||
#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f32m4_f32m1(v_res, va, vb, gvl)
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m4(n)
|
||||
#define VSETVL_MAX vsetvlmax_e64m1()
|
||||
#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1)
|
||||
#endif
|
||||
#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4)
|
||||
#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4)
|
||||
#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4)
|
||||
#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1)
|
||||
#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f32m4)
|
||||
#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m4)
|
||||
#else
|
||||
#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
|
||||
#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)()
|
||||
#define FLOAT_V_T vfloat64m4_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
|
||||
#define VLEV_FLOAT vle64_v_f64m4
|
||||
#define VLSEV_FLOAT vlse64_v_f64m4
|
||||
#define VSEV_FLOAT vse64_v_f64m4
|
||||
#define VSSEV_FLOAT vsse64_v_f64m4
|
||||
#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
|
||||
#define VFMACCVV_FLOAT vfmacc_vv_f64m4
|
||||
#define VFMACCVF_FLOAT vfmacc_vf_f64m4
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f64m4
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
|
||||
#define VFDOTVV_FLOAT vfdot_vv_f64m4
|
||||
#define VFMULVV_FLOAT vfmul_vv_f64m4
|
||||
#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4)
|
||||
#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4)
|
||||
#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4)
|
||||
#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4)
|
||||
#ifdef RISCV_0p10_INTRINSICS
|
||||
#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f64m4_f64m1(v_res, va, vb, gvl)
|
||||
#else
|
||||
#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1)
|
||||
#endif
|
||||
#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4)
|
||||
#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4)
|
||||
#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4)
|
||||
#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1)
|
||||
#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f64m4)
|
||||
#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m4)
|
||||
#endif
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
|
||||
@@ -101,8 +107,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
|
||||
|
||||
i += gvl;
|
||||
}
|
||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
|
||||
temp2 = VFMVFS_FLOAT(v_res);
|
||||
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
|
||||
temp2 = EXTRACT_FLOAT(v_res);
|
||||
if(i < j){
|
||||
gvl = VSETVL(j-i);
|
||||
vy = VLEV_FLOAT(&y[i], gvl);
|
||||
@@ -112,8 +118,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
|
||||
|
||||
vx = VLEV_FLOAT(&x[i], gvl);
|
||||
vr = VFMULVV_FLOAT(vx, va, gvl);
|
||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
|
||||
temp2 += VFMVFS_FLOAT(v_res);
|
||||
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
|
||||
temp2 += EXTRACT_FLOAT(v_res);
|
||||
}
|
||||
}
|
||||
y[j] += temp1 * a_ptr[j] + alpha * temp2;
|
||||
@@ -145,8 +151,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
|
||||
i += gvl;
|
||||
iy += inc_yv;
|
||||
}
|
||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
|
||||
temp2 = VFMVFS_FLOAT(v_res);
|
||||
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
|
||||
temp2 = EXTRACT_FLOAT(v_res);
|
||||
if(i < j){
|
||||
gvl = VSETVL(j-i);
|
||||
vy = VLSEV_FLOAT(&y[iy], stride_y, gvl);
|
||||
@@ -156,8 +162,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
|
||||
|
||||
vx = VLEV_FLOAT(&x[i], gvl);
|
||||
vr = VFMULVV_FLOAT(vx, va, gvl);
|
||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
|
||||
temp2 += VFMVFS_FLOAT(v_res);
|
||||
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
|
||||
temp2 += EXTRACT_FLOAT(v_res);
|
||||
}
|
||||
}
|
||||
y[jy] += temp1 * a_ptr[j] + alpha * temp2;
|
||||
@@ -190,8 +196,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
|
||||
i += gvl;
|
||||
ix += inc_xv;
|
||||
}
|
||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
|
||||
temp2 = VFMVFS_FLOAT(v_res);
|
||||
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
|
||||
temp2 = EXTRACT_FLOAT(v_res);
|
||||
if(i < j){
|
||||
gvl = VSETVL(j-i);
|
||||
vy = VLEV_FLOAT(&y[i], gvl);
|
||||
@@ -201,8 +207,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
|
||||
|
||||
vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
|
||||
vr = VFMULVV_FLOAT(vx, va, gvl);
|
||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
|
||||
temp2 += VFMVFS_FLOAT(v_res);
|
||||
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
|
||||
temp2 += EXTRACT_FLOAT(v_res);
|
||||
}
|
||||
}
|
||||
y[j] += temp1 * a_ptr[j] + alpha * temp2;
|
||||
@@ -240,8 +246,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
|
||||
ix += inc_xv;
|
||||
iy += inc_yv;
|
||||
}
|
||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
|
||||
temp2 = VFMVFS_FLOAT(v_res);
|
||||
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
|
||||
temp2 = EXTRACT_FLOAT(v_res);
|
||||
if(i < j){
|
||||
gvl = VSETVL(j-i);
|
||||
vy = VLSEV_FLOAT(&y[iy], stride_y, gvl);
|
||||
@@ -251,8 +257,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
|
||||
|
||||
vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
|
||||
vr = VFMULVV_FLOAT(vx, va, gvl);
|
||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
|
||||
temp2 += VFMVFS_FLOAT(v_res);
|
||||
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
|
||||
temp2 += EXTRACT_FLOAT(v_res);
|
||||
}
|
||||
}
|
||||
y[jy] += temp1 * a_ptr[j] + alpha * temp2;
|
||||
|
||||
138
kernel/riscv64/trmm_lncopy_rvv_v1.c
Normal file
138
kernel/riscv64/trmm_lncopy_rvv_v1.c
Normal file
@@ -0,0 +1,138 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Precision-dependent aliases for the RVV intrinsics used by this kernel.
 * Both variants use LMUL=2 register groups; the mask type follows the
 * SEW/LMUL ratio of the selected element width (64/2 -> vbool32_t,
 * 32/2 -> vbool16_t).
 */
#if defined(DOUBLE)
/* Double precision: 64-bit elements. */
#define VSETVL(n)           __riscv_vsetvl_e64m2(n)
#define FLOAT_V_T           vfloat64m2_t
#define VLEV_FLOAT          __riscv_vle64_v_f64m2
#define VSEV_FLOAT          __riscv_vse64_v_f64m2
#define VLSEV_FLOAT         __riscv_vlse64_v_f64m2
#define VBOOL_T             vbool32_t
#define UINT_V_T            vuint64m2_t
#define VID_V_UINT          __riscv_vid_v_u64m2
#define VMSGTU_VX_UINT      __riscv_vmsgtu_vx_u64m2_b32
#define VMSEQ_VX_UINT       __riscv_vmseq_vx_u64m2_b32
#define VFMERGE_VFM_FLOAT   __riscv_vfmerge_vfm_f64m2
#else
/* Single precision: 32-bit elements. */
#define VSETVL(n)           __riscv_vsetvl_e32m2(n)
#define FLOAT_V_T           vfloat32m2_t
#define VLEV_FLOAT          __riscv_vle32_v_f32m2
#define VSEV_FLOAT          __riscv_vse32_v_f32m2
#define VLSEV_FLOAT         __riscv_vlse32_v_f32m2
#define VBOOL_T             vbool16_t
#define UINT_V_T            vuint32m2_t
#define VID_V_UINT          __riscv_vid_v_u32m2
#define VMSGTU_VX_UINT      __riscv_vmsgtu_vx_u32m2_b16
#define VMSEQ_VX_UINT       __riscv_vmseq_vx_u32m2_b16
#define VFMERGE_VFM_FLOAT   __riscv_vfmerge_vfm_f32m2
#endif
|
||||
|
||||
// Optimizes the implementation in ../arm64/trmm_lncopy_sve_v1.c
|
||||
|
||||
/*
 * Packs data from a into the contiguous buffer b.
 *
 * n is consumed in strips of vl elements, where vl is whatever VSETVL grants
 * for the count still outstanding. posY grows by vl after every strip. Within
 * a strip a position counter starts at posX and a step counter runs from 0
 * until it reaches m (the loop body runs at least once per strip):
 *
 *   position >  posY : one strided load of vl values (stride = lda elements)
 *                      is stored contiguously to b; the source pointer moves
 *                      on by one element.
 *   position <  posY : nothing is stored; b still advances by vl, so those
 *                      slots are left unwritten, and the source pointer
 *                      moves on by lda elements.
 *   position == posY : vl vectors are produced. In vector j every lane whose
 *                      index is greater than j is replaced by ZERO and, when
 *                      UNIT is defined, lane j is replaced by ONE. Position
 *                      and step counter then jump ahead by vl.
 *
 * NOTE(review): presumably the lower-triangular, non-transposed TRMM packing
 * routine -- confirm against the kernel build rules.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){

    BLASLONG step, remaining, pos;
    FLOAT *src;
    FLOAT_V_T loaded, packed;
    VBOOL_T zero_mask;
    UINT_V_T lane_idx;
#ifdef UNIT
    VBOOL_T one_mask;
#endif
    size_t vl;

    /* Byte stride handed to the strided loads: lda elements apart. */
    BLASLONG stride_bytes = sizeof(FLOAT) * lda;

    for (remaining = n; remaining > 0; remaining -= vl)
    {
        vl = VSETVL(remaining);
        pos = posX;

        src = (posX <= posY) ? a + posY + posX * lda
                             : a + posX + posY * lda;

        step = 0;
        do
        {
            if (pos == posY)
            {
                /* lane_idx holds 0, 1, 2, ... so it can be compared against j. */
                lane_idx = VID_V_UINT(vl);
                for (unsigned int j = 0; j < vl; j++)
                {
                    loaded = VLSEV_FLOAT(src, stride_bytes, vl);
                    /* Lanes beyond j are forced to ZERO. */
                    zero_mask = VMSGTU_VX_UINT(lane_idx, j, vl);
                    packed = VFMERGE_VFM_FLOAT(loaded, ZERO, zero_mask, vl);
#ifdef UNIT
                    /* Lane j itself is forced to ONE. */
                    one_mask = VMSEQ_VX_UINT(lane_idx, j, vl);
                    packed = VFMERGE_VFM_FLOAT(packed, ONE, one_mask, vl);
#endif
                    VSEV_FLOAT(b, packed, vl);
                    src += 1;
                    b += vl;
                }

                pos += vl;
                step += vl;
            }
            else
            {
                if (pos > posY)
                {
                    /* Plain copy: gather vl strided values into b. */
                    loaded = VLSEV_FLOAT(src, stride_bytes, vl);
                    VSEV_FLOAT(b, loaded, vl);
                    src += 1;
                }
                else
                {
                    /* Nothing copied here; only the pointers advance. */
                    src += lda;
                }

                b += vl;
                pos += 1;
                step += 1;
            }
        } while (step < m);

        posY += vl;
    }

    return 0;
}
|
||||
134
kernel/riscv64/trmm_ltcopy_rvv_v1.c
Normal file
134
kernel/riscv64/trmm_ltcopy_rvv_v1.c
Normal file
@@ -0,0 +1,134 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Precision-dependent aliases for the RVV intrinsics used by this kernel.
 * Both variants use LMUL=2 register groups; the mask type follows the
 * SEW/LMUL ratio of the selected element width (64/2 -> vbool32_t,
 * 32/2 -> vbool16_t).
 */
#if defined(DOUBLE)
/* Double precision: 64-bit elements. */
#define VSETVL(n)           __riscv_vsetvl_e64m2(n)
#define FLOAT_V_T           vfloat64m2_t
#define VLEV_FLOAT          __riscv_vle64_v_f64m2
#define VSEV_FLOAT          __riscv_vse64_v_f64m2
#define VBOOL_T             vbool32_t
#define UINT_V_T            vuint64m2_t
#define VID_V_UINT          __riscv_vid_v_u64m2
#define VMSLTU_VX_UINT      __riscv_vmsltu_vx_u64m2_b32
#define VMSEQ_VX_UINT       __riscv_vmseq_vx_u64m2_b32
#define VFMERGE_VFM_FLOAT   __riscv_vfmerge_vfm_f64m2
#else
/* Single precision: 32-bit elements. */
#define VSETVL(n)           __riscv_vsetvl_e32m2(n)
#define FLOAT_V_T           vfloat32m2_t
#define VLEV_FLOAT          __riscv_vle32_v_f32m2
#define VSEV_FLOAT          __riscv_vse32_v_f32m2
#define VBOOL_T             vbool16_t
#define UINT_V_T            vuint32m2_t
#define VID_V_UINT          __riscv_vid_v_u32m2
#define VMSLTU_VX_UINT      __riscv_vmsltu_vx_u32m2_b16
#define VMSEQ_VX_UINT       __riscv_vmseq_vx_u32m2_b16
#define VFMERGE_VFM_FLOAT   __riscv_vfmerge_vfm_f32m2
#endif
|
||||
|
||||
// Optimizes the implementation in ../arm64/trmm_ltcopy_sve_v1.c
|
||||
|
||||
/*
 * Packs data from a into the contiguous buffer b.
 *
 * n is consumed in strips of vl elements, where vl is whatever VSETVL grants
 * for the count still outstanding. posY grows by vl after every strip. Within
 * a strip a position counter starts at posX and a step counter runs from 0
 * until it reaches m (the loop body runs at least once per strip):
 *
 *   position >  posY : nothing is stored; b still advances by vl, so those
 *                      slots are left unwritten, and the source pointer
 *                      moves on by one element.
 *   position <  posY : vl contiguous values are loaded from the source
 *                      pointer and stored to b; the source pointer moves on
 *                      by lda elements.
 *   position == posY : vl vectors are produced from contiguous loads, the
 *                      source pointer moving by lda elements after each. In
 *                      vector j every lane whose index is less than j is
 *                      replaced by ZERO and, when UNIT is defined, lane j is
 *                      replaced by ONE. Position and step counter then jump
 *                      ahead by vl.
 *
 * NOTE(review): presumably the lower-triangular, transposed TRMM packing
 * routine, going by the file name -- confirm against the kernel build rules.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){

    BLASLONG step, remaining, pos;
    FLOAT *src;
    FLOAT_V_T loaded, packed;
    VBOOL_T zero_mask;
    UINT_V_T lane_idx;
#ifdef UNIT
    VBOOL_T one_mask;
#endif
    size_t vl;

    for (remaining = n; remaining > 0; remaining -= vl)
    {
        vl = VSETVL(remaining);
        pos = posX;

        src = (posX <= posY) ? a + posY + posX * lda
                             : a + posX + posY * lda;

        step = 0;
        do
        {
            if (pos == posY)
            {
                /* lane_idx holds 0, 1, 2, ... so it can be compared against j. */
                lane_idx = VID_V_UINT(vl);
                for (unsigned int j = 0; j < vl; j++)
                {
                    loaded = VLEV_FLOAT(src, vl);
                    /* Lanes before j are forced to ZERO. */
                    zero_mask = VMSLTU_VX_UINT(lane_idx, j, vl);
                    packed = VFMERGE_VFM_FLOAT(loaded, ZERO, zero_mask, vl);
#ifdef UNIT
                    /* Lane j itself is forced to ONE. */
                    one_mask = VMSEQ_VX_UINT(lane_idx, j, vl);
                    packed = VFMERGE_VFM_FLOAT(packed, ONE, one_mask, vl);
#endif
                    VSEV_FLOAT(b, packed, vl);
                    src += lda;
                    b += vl;
                }

                pos += vl;
                step += vl;
            }
            else
            {
                if (pos > posY)
                {
                    /* Nothing copied here; only the pointers advance. */
                    src += 1;
                }
                else
                {
                    /* Plain copy of vl contiguous values. */
                    loaded = VLEV_FLOAT(src, vl);
                    VSEV_FLOAT(b, loaded, vl);
                    src += lda;
                }

                b += vl;
                pos += 1;
                step += 1;
            }
        } while (step < m);

        posY += vl;
    }

    return 0;
}
|
||||
|
||||
136
kernel/riscv64/trmm_uncopy_rvv_v1.c
Normal file
136
kernel/riscv64/trmm_uncopy_rvv_v1.c
Normal file
@@ -0,0 +1,136 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Precision-dependent aliases for the RVV intrinsics used by this kernel.
 * Both variants use LMUL=2 register groups; the mask type follows the
 * SEW/LMUL ratio of the selected element width (64/2 -> vbool32_t,
 * 32/2 -> vbool16_t).
 */
#if defined(DOUBLE)
/* Double precision: 64-bit elements. */
#define VSETVL(n)           __riscv_vsetvl_e64m2(n)
#define FLOAT_V_T           vfloat64m2_t
#define VLEV_FLOAT          __riscv_vle64_v_f64m2
#define VLSEV_FLOAT         __riscv_vlse64_v_f64m2
#define VSEV_FLOAT          __riscv_vse64_v_f64m2
#define VBOOL_T             vbool32_t
#define UINT_V_T            vuint64m2_t
#define VID_V_UINT          __riscv_vid_v_u64m2
#define VMSLTU_VX_UINT      __riscv_vmsltu_vx_u64m2_b32
#define VMSEQ_VX_UINT       __riscv_vmseq_vx_u64m2_b32
#define VFMERGE_VFM_FLOAT   __riscv_vfmerge_vfm_f64m2
#else
/* Single precision: 32-bit elements. */
#define VSETVL(n)           __riscv_vsetvl_e32m2(n)
#define FLOAT_V_T           vfloat32m2_t
#define VLEV_FLOAT          __riscv_vle32_v_f32m2
#define VLSEV_FLOAT         __riscv_vlse32_v_f32m2
#define VSEV_FLOAT          __riscv_vse32_v_f32m2
#define VBOOL_T             vbool16_t
#define UINT_V_T            vuint32m2_t
#define VID_V_UINT          __riscv_vid_v_u32m2
#define VMSLTU_VX_UINT      __riscv_vmsltu_vx_u32m2_b16
#define VMSEQ_VX_UINT       __riscv_vmseq_vx_u32m2_b16
#define VFMERGE_VFM_FLOAT   __riscv_vfmerge_vfm_f32m2
#endif
|
||||
|
||||
// Optimizes the implementation in ../arm64/trmm_uncopy_sve_v1.c
|
||||
|
||||
/*
 * Packs data from a into the contiguous buffer b.
 *
 * n is consumed in strips of vl elements, where vl is whatever VSETVL grants
 * for the count still outstanding. posY grows by vl after every strip. Within
 * a strip a position counter starts at posX and a step counter runs from 0
 * until it reaches m (the loop body runs at least once per strip):
 *
 *   position <  posY : one strided load of vl values (stride = lda elements)
 *                      is stored contiguously to b; the source pointer moves
 *                      on by one element.
 *   position >  posY : nothing is stored; b still advances by vl, so those
 *                      slots are left unwritten, and the source pointer
 *                      moves on by lda elements.
 *   position == posY : vl vectors are produced from strided loads, the
 *                      source pointer moving by one element after each. In
 *                      vector j every lane whose index is less than j is
 *                      replaced by ZERO and, when UNIT is defined, lane j is
 *                      replaced by ONE. Position and step counter then jump
 *                      ahead by vl.
 *
 * NOTE(review): presumably the upper-triangular, non-transposed TRMM packing
 * routine, going by the file name -- confirm against the kernel build rules.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){

    BLASLONG step, remaining, pos;
    FLOAT *src;
    FLOAT_V_T loaded, packed;
    VBOOL_T zero_mask;
    UINT_V_T lane_idx;
#ifdef UNIT
    VBOOL_T one_mask;
#endif
    size_t vl;

    /* Byte stride handed to the strided loads: lda elements apart. */
    BLASLONG stride_bytes = sizeof(FLOAT) * lda;

    for (remaining = n; remaining > 0; remaining -= vl)
    {
        vl = VSETVL(remaining);
        pos = posX;

        src = (posX <= posY) ? a + posX + posY * lda
                             : a + posY + posX * lda;

        step = 0;
        do
        {
            if (pos == posY)
            {
                /* lane_idx holds 0, 1, 2, ... so it can be compared against j. */
                lane_idx = VID_V_UINT(vl);
                for (unsigned int j = 0; j < vl; j++)
                {
                    loaded = VLSEV_FLOAT(src, stride_bytes, vl);
                    /* Lanes before j are forced to ZERO. */
                    zero_mask = VMSLTU_VX_UINT(lane_idx, j, vl);
                    packed = VFMERGE_VFM_FLOAT(loaded, ZERO, zero_mask, vl);
#ifdef UNIT
                    /* Lane j itself is forced to ONE. */
                    one_mask = VMSEQ_VX_UINT(lane_idx, j, vl);
                    packed = VFMERGE_VFM_FLOAT(packed, ONE, one_mask, vl);
#endif
                    VSEV_FLOAT(b, packed, vl);
                    src += 1;
                    b += vl;
                }

                pos += vl;
                step += vl;
            }
            else
            {
                if (pos < posY)
                {
                    /* Plain copy: gather vl strided values into b. */
                    loaded = VLSEV_FLOAT(src, stride_bytes, vl);
                    VSEV_FLOAT(b, loaded, vl);
                    src += 1;
                }
                else
                {
                    /* Nothing copied here; only the pointers advance. */
                    src += lda;
                }

                b += vl;
                pos += 1;
                step += 1;
            }
        } while (step < m);

        posY += vl;
    }

    return 0;
}
|
||||
133
kernel/riscv64/trmm_utcopy_rvv_v1.c
Normal file
133
kernel/riscv64/trmm_utcopy_rvv_v1.c
Normal file
@@ -0,0 +1,133 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Precision-dependent aliases for the RVV intrinsics used by this kernel.
 * Both variants use LMUL=2 register groups; the mask type follows the
 * SEW/LMUL ratio of the selected element width (64/2 -> vbool32_t,
 * 32/2 -> vbool16_t).
 */
#if defined(DOUBLE)
/* Double precision: 64-bit elements. */
#define VSETVL(n)           __riscv_vsetvl_e64m2(n)
#define FLOAT_V_T           vfloat64m2_t
#define VLEV_FLOAT          __riscv_vle64_v_f64m2
#define VSEV_FLOAT          __riscv_vse64_v_f64m2
#define VBOOL_T             vbool32_t
#define UINT_V_T            vuint64m2_t
#define VID_V_UINT          __riscv_vid_v_u64m2
#define VMSGTU_VX_UINT      __riscv_vmsgtu_vx_u64m2_b32
#define VMSEQ_VX_UINT       __riscv_vmseq_vx_u64m2_b32
#define VFMERGE_VFM_FLOAT   __riscv_vfmerge_vfm_f64m2
#else
/* Single precision: 32-bit elements. */
#define VSETVL(n)           __riscv_vsetvl_e32m2(n)
#define FLOAT_V_T           vfloat32m2_t
#define VLEV_FLOAT          __riscv_vle32_v_f32m2
#define VSEV_FLOAT          __riscv_vse32_v_f32m2
#define VBOOL_T             vbool16_t
#define UINT_V_T            vuint32m2_t
#define VID_V_UINT          __riscv_vid_v_u32m2
#define VMSGTU_VX_UINT      __riscv_vmsgtu_vx_u32m2_b16
#define VMSEQ_VX_UINT       __riscv_vmseq_vx_u32m2_b16
#define VFMERGE_VFM_FLOAT   __riscv_vfmerge_vfm_f32m2
#endif
|
||||
|
||||
// Optimizes the implementation in ../arm64/trmm_utcopy_sve_v1.c
|
||||
|
||||
/*
 * Packs data from a into the contiguous buffer b.
 *
 * n is consumed in strips of vl elements, where vl is whatever VSETVL grants
 * for the count still outstanding. posY grows by vl after every strip. Within
 * a strip a position counter starts at posX and a step counter runs from 0
 * until it reaches m (the loop body runs at least once per strip):
 *
 *   position <  posY : nothing is stored; b still advances by vl, so those
 *                      slots are left unwritten, and the source pointer
 *                      moves on by one element.
 *   position >  posY : vl contiguous values are loaded from the source
 *                      pointer and stored to b; the source pointer moves on
 *                      by lda elements.
 *   position == posY : vl vectors are produced from contiguous loads, the
 *                      source pointer moving by lda elements after each. In
 *                      vector j every lane whose index is greater than j is
 *                      replaced by ZERO and, when UNIT is defined, lane j is
 *                      replaced by ONE. Position and step counter then jump
 *                      ahead by vl.
 *
 * NOTE(review): presumably the upper-triangular, transposed TRMM packing
 * routine, going by the file name -- confirm against the kernel build rules.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){

    BLASLONG step, j, remaining, pos;
    FLOAT *src;
    FLOAT_V_T loaded, packed;
    VBOOL_T zero_mask;
    UINT_V_T lane_idx;
#ifdef UNIT
    VBOOL_T one_mask;
#endif
    size_t vl;

    for (remaining = n; remaining > 0; remaining -= vl)
    {
        vl = VSETVL(remaining);
        pos = posX;

        src = (posX <= posY) ? a + posX + posY * lda
                             : a + posY + posX * lda;

        step = 0;
        do
        {
            if (pos == posY)
            {
                /* lane_idx holds 0, 1, 2, ... so it can be compared against j. */
                lane_idx = VID_V_UINT(vl);
                for (j = 0; j < vl; j++)
                {
                    loaded = VLEV_FLOAT(src, vl);
                    /* Lanes beyond j are forced to ZERO. */
                    zero_mask = VMSGTU_VX_UINT(lane_idx, j, vl);
                    packed = VFMERGE_VFM_FLOAT(loaded, ZERO, zero_mask, vl);
#ifdef UNIT
                    /* Lane j itself is forced to ONE. */
                    one_mask = VMSEQ_VX_UINT(lane_idx, j, vl);
                    packed = VFMERGE_VFM_FLOAT(packed, ONE, one_mask, vl);
#endif
                    VSEV_FLOAT(b, packed, vl);
                    src += lda;
                    b += vl;
                }

                pos += vl;
                step += vl;
            }
            else
            {
                if (pos < posY)
                {
                    /* Nothing copied here; only the pointers advance. */
                    src += 1;
                }
                else
                {
                    /* Plain copy of vl contiguous values. */
                    loaded = VLEV_FLOAT(src, vl);
                    VSEV_FLOAT(b, loaded, vl);
                    src += lda;
                }

                b += vl;
                pos += 1;
                step += 1;
            }
        } while (step < m);

        posY += vl;
    }

    return 0;
}
|
||||
|
||||
685
kernel/riscv64/trmmkernel_rvv_v1x8.c
Normal file
685
kernel/riscv64/trmmkernel_rvv_v1x8.c
Normal file
@@ -0,0 +1,685 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Precision-dependent aliases for the RVV intrinsics used by this kernel:
 * vector-length setup, unit-stride load/store, scalar broadcast,
 * vector-by-scalar multiply-accumulate and vector-by-scalar multiply.
 * Both variants use LMUL=2 register groups.
 */
#if defined(DOUBLE)
/* Double precision: 64-bit elements. */
#define VSETVL(n)           __riscv_vsetvl_e64m2(n)
#define FLOAT_V_T           vfloat64m2_t
#define VLEV_FLOAT          __riscv_vle64_v_f64m2
#define VSEV_FLOAT          __riscv_vse64_v_f64m2
#define VFMVVF_FLOAT        __riscv_vfmv_v_f_f64m2
#define VFMACCVF_FLOAT      __riscv_vfmacc_vf_f64m2
#define VFMULVF_FLOAT       __riscv_vfmul_vf_f64m2
#else
/* Single precision: 32-bit elements. */
#define VSETVL(n)           __riscv_vsetvl_e32m2(n)
#define FLOAT_V_T           vfloat32m2_t
#define VLEV_FLOAT          __riscv_vle32_v_f32m2
#define VSEV_FLOAT          __riscv_vse32_v_f32m2
#define VFMVVF_FLOAT        __riscv_vfmv_v_f_f32m2
#define VFMACCVF_FLOAT      __riscv_vfmacc_vf_f32m2
#define VFMULVF_FLOAT       __riscv_vfmul_vf_f32m2
#endif
|
||||
|
||||
|
||||
// Optimizes the implementation in ../generic/trmmkernel_8x8.c
|
||||
|
||||
|
||||
int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset)
|
||||
{
|
||||
//fprintf(stderr, "%s, %s, bm=%4ld bn=%4ld bk=%4ld alpha=%f ldc=%ld\n", __FILE__, __FUNCTION__, bm, bn, bk, alpha, ldc);
|
||||
|
||||
BLASLONG i,j,k;
|
||||
FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7,*ptrba,*ptrbb;
|
||||
|
||||
FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7;
|
||||
FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7;
|
||||
size_t vl;
|
||||
|
||||
BLASLONG off, temp;
|
||||
|
||||
#if !defined(LEFT)
|
||||
off = -offset;
|
||||
#else
|
||||
off = 0;
|
||||
#endif
|
||||
for (j = bn/8; j > 0; j--)
|
||||
{
|
||||
C0 = C;
|
||||
C1 = C0+ldc;
|
||||
C2 = C1+ldc;
|
||||
C3 = C2+ldc;
|
||||
C4 = C3+ldc;
|
||||
C5 = C4+ldc;
|
||||
C6 = C5+ldc;
|
||||
C7 = C6+ldc;
|
||||
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
off = offset;
|
||||
#endif
|
||||
|
||||
ptrba = ba;
|
||||
|
||||
for (i = bm; i > 0; i -= vl)
|
||||
{
|
||||
vl = VSETVL(i);
|
||||
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
ptrbb = bb;
|
||||
#else
|
||||
ptrba += off*vl;
|
||||
ptrbb = bb + off*8;
|
||||
#endif
|
||||
|
||||
vres0 = VFMVVF_FLOAT(0.0, vl);
|
||||
vres1 = VFMVVF_FLOAT(0.0, vl);
|
||||
vres2 = VFMVVF_FLOAT(0.0, vl);
|
||||
vres3 = VFMVVF_FLOAT(0.0, vl);
|
||||
vres4 = VFMVVF_FLOAT(0.0, vl);
|
||||
vres5 = VFMVVF_FLOAT(0.0, vl);
|
||||
vres6 = VFMVVF_FLOAT(0.0, vl);
|
||||
vres7 = VFMVVF_FLOAT(0.0, vl);
|
||||
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
||||
temp = bk-off;
|
||||
#elif defined(LEFT)
|
||||
temp = off+vl; // number of values in A
|
||||
#else
|
||||
temp = off+8; // number of values in B
|
||||
#endif
|
||||
|
||||
for (k = temp/8; k > 0; k--) {
|
||||
va0 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
va1 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl);
|
||||
vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl);
|
||||
vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl);
|
||||
vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl);
|
||||
vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl);
|
||||
ptrbb += 8;
|
||||
va2 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl);
|
||||
vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va1, vl);
|
||||
vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va1, vl);
|
||||
vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va1, vl);
|
||||
vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va1, vl);
|
||||
ptrbb += 8;
|
||||
va3 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl);
|
||||
vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va2, vl);
|
||||
vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va2, vl);
|
||||
vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va2, vl);
|
||||
vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va2, vl);
|
||||
ptrbb += 8;
|
||||
va4 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl);
|
||||
vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va3, vl);
|
||||
vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va3, vl);
|
||||
vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va3, vl);
|
||||
vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va3, vl);
|
||||
ptrbb += 8;
|
||||
va5 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl);
|
||||
vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va4, vl);
|
||||
vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va4, vl);
|
||||
vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va4, vl);
|
||||
vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va4, vl);
|
||||
ptrbb += 8;
|
||||
va6 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl);
|
||||
vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va5, vl);
|
||||
vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va5, vl);
|
||||
vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va5, vl);
|
||||
vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va5, vl);
|
||||
ptrbb += 8;
|
||||
va7 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl);
|
||||
vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va6, vl);
|
||||
vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va6, vl);
|
||||
vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va6, vl);
|
||||
vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va6, vl);
|
||||
ptrbb += 8;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl);
|
||||
vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va7, vl);
|
||||
vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va7, vl);
|
||||
vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va7, vl);
|
||||
vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va7, vl);
|
||||
ptrbb += 8;
|
||||
}
|
||||
|
||||
for (k = temp&7; k > 0; k--) {
|
||||
va0 = VLEV_FLOAT(ptrba, vl); // M:8 (should be vlen);
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl);
|
||||
vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl);
|
||||
vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl);
|
||||
vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl);
|
||||
vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl);
|
||||
|
||||
ptrbb += 8;
|
||||
ptrba += vl;
|
||||
}
|
||||
|
||||
va0 = VFMULVF_FLOAT(vres0, alpha, vl);
|
||||
VSEV_FLOAT(C0, va0, vl);
|
||||
|
||||
va1 = VFMULVF_FLOAT(vres1, alpha, vl);
|
||||
VSEV_FLOAT(C1, va1, vl);
|
||||
|
||||
va2 = VFMULVF_FLOAT(vres2, alpha, vl);
|
||||
VSEV_FLOAT(C2, va2, vl);
|
||||
|
||||
va3 = VFMULVF_FLOAT(vres3, alpha, vl);
|
||||
VSEV_FLOAT(C3, va3, vl);
|
||||
|
||||
va4 = VFMULVF_FLOAT(vres4, alpha, vl);
|
||||
VSEV_FLOAT(C4, va4, vl);
|
||||
|
||||
va5 = VFMULVF_FLOAT(vres5, alpha, vl);
|
||||
VSEV_FLOAT(C5, va5, vl);
|
||||
|
||||
va6 = VFMULVF_FLOAT(vres6, alpha, vl);
|
||||
VSEV_FLOAT(C6, va6, vl);
|
||||
|
||||
va7 = VFMULVF_FLOAT(vres7, alpha, vl);
|
||||
VSEV_FLOAT(C7, va7, vl);
|
||||
|
||||
|
||||
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
temp = bk - off;
|
||||
#ifdef LEFT
|
||||
temp -= vl; // number of values in A
|
||||
#else
|
||||
temp -= 8; // number of values in B
|
||||
#endif
|
||||
ptrba += temp*vl;
|
||||
ptrbb += temp*8;
|
||||
#endif
|
||||
|
||||
#ifdef LEFT
|
||||
off += vl; // number of values in A
|
||||
#endif
|
||||
|
||||
C0 += vl;
|
||||
C1 += vl;
|
||||
C2 += vl;
|
||||
C3 += vl;
|
||||
C4 += vl;
|
||||
C5 += vl;
|
||||
C6 += vl;
|
||||
C7 += vl;
|
||||
}
|
||||
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
off += 8;
|
||||
#endif
|
||||
|
||||
bb += (bk<<3);
|
||||
C += (ldc<<3);
|
||||
}
|
||||
|
||||
if (bn & 4)
|
||||
{
|
||||
C0 = C;
|
||||
C1 = C0+ldc;
|
||||
C2 = C1+ldc;
|
||||
C3 = C2+ldc;
|
||||
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
off = offset;
|
||||
#endif
|
||||
ptrba = ba;
|
||||
|
||||
for (i = bm; i > 0; i -= vl)
|
||||
{
|
||||
vl = VSETVL(i);
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
ptrbb = bb;
|
||||
#else
|
||||
ptrba += off*vl;
|
||||
ptrbb = bb + off*4;
|
||||
#endif
|
||||
|
||||
vres0 = VFMVVF_FLOAT(0.0, vl);
|
||||
vres1 = VFMVVF_FLOAT(0.0, vl);
|
||||
vres2 = VFMVVF_FLOAT(0.0, vl);
|
||||
vres3 = VFMVVF_FLOAT(0.0, vl);
|
||||
|
||||
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
||||
temp = bk-off;
|
||||
#elif defined(LEFT)
|
||||
temp = off+vl; // number of values in A
|
||||
#else
|
||||
temp = off+4; // number of values in B
|
||||
#endif
|
||||
|
||||
for (k = temp/8; k > 0; k--) {
|
||||
va0 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
va1 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl);
|
||||
ptrbb += 4;
|
||||
va2 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl);
|
||||
ptrbb += 4;
|
||||
va3 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl);
|
||||
ptrbb += 4;
|
||||
va4 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl);
|
||||
ptrbb += 4;
|
||||
va5 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl);
|
||||
ptrbb += 4;
|
||||
va6 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl);
|
||||
ptrbb += 4;
|
||||
va7 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl);
|
||||
ptrbb += 4;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl);
|
||||
ptrbb += 4;
|
||||
}
|
||||
|
||||
// K remainder
|
||||
for (k = temp&7; k > 0; k--) {
|
||||
va0 = VLEV_FLOAT(ptrba, vl);
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl);
|
||||
|
||||
ptrbb += 4;
|
||||
ptrba += vl;
|
||||
}
|
||||
|
||||
va0 = VFMULVF_FLOAT(vres0, alpha, vl);
|
||||
VSEV_FLOAT(C0, va0, vl);
|
||||
|
||||
va1 = VFMULVF_FLOAT(vres1, alpha, vl);
|
||||
VSEV_FLOAT(C1, va1, vl);
|
||||
|
||||
va2 = VFMULVF_FLOAT(vres2, alpha, vl);
|
||||
VSEV_FLOAT(C2, va2, vl);
|
||||
|
||||
va3 = VFMULVF_FLOAT(vres3, alpha, vl);
|
||||
VSEV_FLOAT(C3, va3, vl);
|
||||
|
||||
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
temp = bk - off;
|
||||
#ifdef LEFT
|
||||
temp -= vl; // number of values in A
|
||||
#else
|
||||
temp -= 4; // number of values in B
|
||||
#endif
|
||||
ptrba += temp*vl;
|
||||
ptrbb += temp*4;
|
||||
#endif
|
||||
|
||||
#ifdef LEFT
|
||||
off += vl; // number of values in A
|
||||
#endif
|
||||
|
||||
C0 += vl;
|
||||
C1 += vl;
|
||||
C2 += vl;
|
||||
C3 += vl;
|
||||
}
|
||||
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
off += 4;
|
||||
#endif
|
||||
|
||||
bb += (bk<<2);
|
||||
C += (ldc<<2);
|
||||
}
|
||||
|
||||
if (bn & 2)
|
||||
{
|
||||
C0 = C;
|
||||
C1 = C0+ldc;
|
||||
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
off = offset;
|
||||
#endif
|
||||
|
||||
ptrba = ba;
|
||||
|
||||
for (i = bm; i > 0; i -= vl)
|
||||
{
|
||||
vl = VSETVL(i);
|
||||
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
ptrbb = bb;
|
||||
#else
|
||||
ptrba += off*vl;
|
||||
ptrbb = bb + off*2;
|
||||
#endif
|
||||
|
||||
vres0 = VFMVVF_FLOAT(0.0, vl);
|
||||
vres1 = VFMVVF_FLOAT(0.0, vl);
|
||||
|
||||
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
||||
temp = bk-off;
|
||||
#elif defined(LEFT)
|
||||
temp = off+vl; // number of values in A
|
||||
#else
|
||||
temp = off+2; // number of values in B
|
||||
#endif
|
||||
|
||||
for (k = temp/8; k > 0; k--) {
|
||||
va0 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
va1 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
|
||||
ptrbb += 2;
|
||||
va2 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl);
|
||||
ptrbb += 2;
|
||||
va3 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl);
|
||||
ptrbb += 2;
|
||||
va4 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl);
|
||||
ptrbb += 2;
|
||||
va5 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl);
|
||||
ptrbb += 2;
|
||||
va6 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl);
|
||||
ptrbb += 2;
|
||||
va7 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl);
|
||||
ptrbb += 2;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl);
|
||||
ptrbb += 2;
|
||||
}
|
||||
|
||||
// K remainder
|
||||
for (k = temp&7; k > 0; k--) {
|
||||
va0 = VLEV_FLOAT(ptrba, vl);
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
|
||||
|
||||
ptrbb += 2;
|
||||
ptrba += vl;
|
||||
}
|
||||
va0 = VFMULVF_FLOAT(vres0, alpha, vl);
|
||||
VSEV_FLOAT(C0, va0, vl);
|
||||
|
||||
va1 = VFMULVF_FLOAT(vres1, alpha, vl);
|
||||
VSEV_FLOAT(C1, va1, vl);
|
||||
|
||||
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
temp = bk - off;
|
||||
#ifdef LEFT
|
||||
temp -= vl; // number of values in A
|
||||
#else
|
||||
temp -= 2; // number of values in B
|
||||
#endif
|
||||
ptrba += temp*vl;
|
||||
ptrbb += temp*2;
|
||||
#endif
|
||||
|
||||
#ifdef LEFT
|
||||
off += vl; // number of values in A
|
||||
#endif
|
||||
|
||||
C0 += vl;
|
||||
C1 += vl;
|
||||
}
|
||||
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
off += 2;
|
||||
#endif
|
||||
|
||||
bb += (bk<<1);
|
||||
C += (ldc<<1);
|
||||
}
|
||||
|
||||
if (bn & 1)
|
||||
{
|
||||
C0 = C;
|
||||
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
off = offset;
|
||||
#endif
|
||||
|
||||
ptrba = ba;
|
||||
|
||||
for (i = bm; i > 0; i -= vl)
|
||||
{
|
||||
vl = VSETVL(i);
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
ptrbb = bb;
|
||||
#else
|
||||
ptrba += off*vl;
|
||||
ptrbb = bb + off*1;
|
||||
#endif
|
||||
|
||||
vres0 = VFMVVF_FLOAT(0.0, vl);
|
||||
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
||||
temp = bk-off;
|
||||
#elif defined(LEFT)
|
||||
temp = off+vl; // number of values in A
|
||||
#else
|
||||
temp = off+1; // number of values in B
|
||||
#endif
|
||||
|
||||
for (k = temp/8; k > 0; k--) {
|
||||
va0 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
va1 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
|
||||
ptrbb += 1;
|
||||
va2 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl);
|
||||
ptrbb += 1;
|
||||
va3 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl);
|
||||
ptrbb += 1;
|
||||
va4 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl);
|
||||
ptrbb += 1;
|
||||
va5 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl);
|
||||
ptrbb += 1;
|
||||
va6 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl);
|
||||
ptrbb += 1;
|
||||
va7 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl);
|
||||
ptrbb += 1;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl);
|
||||
ptrbb += 1;
|
||||
}
|
||||
|
||||
// K remainder
|
||||
for (k = temp&7; k > 0; k--) {
|
||||
va0 = VLEV_FLOAT(ptrba, vl);
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
|
||||
|
||||
ptrbb += 1;
|
||||
ptrba += vl;
|
||||
}
|
||||
va0 = VFMULVF_FLOAT(vres0, alpha, vl);
|
||||
VSEV_FLOAT(C0, va0, vl);
|
||||
|
||||
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
temp = bk - off;
|
||||
#ifdef LEFT
|
||||
temp -= vl; // number of values in A
|
||||
#else
|
||||
temp -= 1; // number of values in B
|
||||
#endif
|
||||
ptrba += temp*vl;
|
||||
ptrbb += temp*1;
|
||||
#endif
|
||||
|
||||
#ifdef LEFT
|
||||
off += vl; // number of values in A
|
||||
#endif
|
||||
|
||||
C0 += vl;
|
||||
}
|
||||
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
off += 1;
|
||||
#endif
|
||||
|
||||
bb += (bk);
|
||||
C += (ldc);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
364
kernel/riscv64/trsm_kernel_LN_rvv_v1.c
Normal file
364
kernel/riscv64/trsm_kernel_LN_rvv_v1.c
Normal file
@@ -0,0 +1,364 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) __riscv_vsetvl_e32m2(n)
|
||||
#define VSETVL_MAX __riscv_vsetvlmax_e32m2()
|
||||
#define FLOAT_V_T vfloat32m2_t
|
||||
#define FLOAT_VX2_T vfloat32m2x2_t
|
||||
#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2
|
||||
#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2
|
||||
#define VLSEV_FLOAT __riscv_vlse32_v_f32m2
|
||||
#define VSSEV_FLOAT __riscv_vsse32_v_f32m2
|
||||
#define VSEV_FLOAT __riscv_vse32_v_f32m2
|
||||
#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2
|
||||
#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2
|
||||
#define VSSSEG2_FLOAT __riscv_vssseg2e32_v_f32m2x2
|
||||
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2
|
||||
#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2
|
||||
#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2
|
||||
#else
|
||||
#define VSETVL(n) __riscv_vsetvl_e64m2(n)
|
||||
#define VSETVL_MAX __riscv_vsetvlmax_e64m2()
|
||||
#define FLOAT_V_T vfloat64m2_t
|
||||
#define FLOAT_VX2_T vfloat64m2x2_t
|
||||
#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2
|
||||
#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2
|
||||
#define VLSEV_FLOAT __riscv_vlse64_v_f64m2
|
||||
#define VSSEV_FLOAT __riscv_vsse64_v_f64m2
|
||||
#define VSEV_FLOAT __riscv_vse64_v_f64m2
|
||||
#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2
|
||||
#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2
|
||||
#define VSSSEG2_FLOAT __riscv_vssseg2e64_v_f64m2x2
|
||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2
|
||||
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2
|
||||
#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2
|
||||
#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m2
|
||||
#endif
|
||||
|
||||
|
||||
static FLOAT dm1 = -1.;
|
||||
|
||||
#ifdef CONJ
|
||||
#define GEMM_KERNEL GEMM_KERNEL_L
|
||||
#else
|
||||
#define GEMM_KERNEL GEMM_KERNEL_N
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 1
|
||||
#define GEMM_UNROLL_N_SHIFT 0
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 2
|
||||
#define GEMM_UNROLL_N_SHIFT 1
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 4
|
||||
#define GEMM_UNROLL_N_SHIFT 2
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 8
|
||||
#define GEMM_UNROLL_N_SHIFT 3
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 16
|
||||
#define GEMM_UNROLL_N_SHIFT 4
|
||||
#endif
|
||||
|
||||
// Optimizes the implementation in ../arm64/trsm_kernel_LN_sve.c
|
||||
|
||||
#ifndef COMPLEX
|
||||
|
||||
/*
 * Real-valued tile solve, rows visited from m-1 down to 0.
 *
 * For each row r: column-strided row r of c is multiplied by a[r*m + r],
 * written contiguously to b[r*n ...] and back into c; then every row p < r
 * of c has a[r*m + p] times that result subtracted from it.
 *
 * m, n : tile rows / columns
 * a    : m x m coefficient tile (row r starts at a + r*m)
 * b    : m x n output tile (row r starts at b + r*n)
 * c    : tile of C, element (r, col) at c[r + col*ldc]
 * ldc  : column stride of c, in elements
 *
 * NOTE(review): a[r*m + r] is used as a multiplier, so it is presumably the
 * pre-inverted diagonal entry -- confirm against the packing routine.
 */
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {

    const BLASLONG col_stride = sizeof(FLOAT) * ldc;   // byte stride between columns of c
    int row, left, p;

    for (row = m - 1; row >= 0; row--) {

        FLOAT *a_row = a + row * m;
        FLOAT *b_out = b + row * n;
        FLOAT *c_blk = c;
        FLOAT  scale = a_row[row];

        left = n;
        while (left > 0) {
            size_t    vl = VSETVL(left);
            FLOAT_V_T x, t;

            // Scale row `row` of this column chunk and store it to both b and c.
            x = VLSEV_FLOAT(c_blk + row, col_stride, vl);
            x = VFMULVF_FLOAT(x, scale, vl);
            VSEV_FLOAT(b_out, x, vl);
            VSSEV_FLOAT(c_blk + row, col_stride, x, vl);
            b_out += vl;

            // Eliminate the scaled row from every row above it.
            for (p = 0; p < row; p++) {
                t = VLSEV_FLOAT(c_blk + p, col_stride, vl);
                t = VFNMSACVF_FLOAT(t, a_row[p], x, vl);
                VSSEV_FLOAT(c_blk + p, col_stride, t, vl);
            }

            c_blk += vl * ldc;
            left  -= vl;
        }
    }
}
|
||||
#else
|
||||
|
||||
/*
 * Complex-valued tile solve, rows visited from m-1 down to 0.
 *
 * Storage is interleaved (re, im). For each row r, column-strided row r of c
 * is multiplied by the complex value at a[(r*m + r)*2] (with the CONJ sign
 * pattern when CONJ is defined), written to b and back to c, and then
 * subtracted, times a[(r*m + p)*2], from every row p < r.
 *
 * m, n : tile rows / columns
 * a    : m x m complex coefficient tile
 * b    : m x n complex output tile
 * c    : tile of C, element (r, col) at c[(r + col*ldc)*2]
 * ldc  : column stride of c, in complex elements
 */
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {

    const BLASLONG col_stride = sizeof(FLOAT) * ldc * 2;   // byte stride between columns of c
    int row, left, p;

    for (row = m - 1; row >= 0; row--) {

        FLOAT *a_row = a + row * m * 2;
        FLOAT *b_out = b + row * n * 2;
        FLOAT *c_blk = c;
        FLOAT  d_re  = a_row[row * 2 + 0];
        FLOAT  d_im  = a_row[row * 2 + 1];

        for (left = n; left > 0; ) {
            size_t      vl = VSETVL(left);
            FLOAT_VX2_T in2, out2, upd2;
            FLOAT_V_T   in_re, in_im, x_re, x_im, u_re, u_im;

            in2   = VLSSEG2_FLOAT(c_blk + row * 2, col_stride, vl);
            in_re = VGET_VX2(in2, 0);
            in_im = VGET_VX2(in2, 1);
#ifndef CONJ
            x_re = VFMULVF_FLOAT(in_re, d_re, vl);
            x_re = VFNMSACVF_FLOAT(x_re, d_im, in_im, vl);
            x_im = VFMULVF_FLOAT(in_im, d_re, vl);
            x_im = VFMACCVF_FLOAT(x_im, d_im, in_re, vl);
#else
            x_re = VFMULVF_FLOAT(in_re, d_re, vl);
            x_re = VFMACCVF_FLOAT(x_re, d_im, in_im, vl);
            x_im = VFMULVF_FLOAT(in_im, d_re, vl);
            x_im = VFNMSACVF_FLOAT(x_im, d_im, in_re, vl);
#endif
            out2 = VSET_VX2(out2, 0, x_re);
            out2 = VSET_VX2(out2, 1, x_im);
            VSSEG2_FLOAT(b_out, out2, vl);
            VSSSEG2_FLOAT(c_blk + row * 2, col_stride, out2, vl);
            b_out += vl * 2;

            // Eliminate the scaled row from every row above it.
            for (p = 0; p < row; p++) {
                upd2 = VLSSEG2_FLOAT(c_blk + p * 2, col_stride, vl);
                u_re = VGET_VX2(upd2, 0);
                u_im = VGET_VX2(upd2, 1);
#ifndef CONJ
                u_re = VFMACCVF_FLOAT(u_re, a_row[p * 2 + 1], x_im, vl);
                u_re = VFNMSACVF_FLOAT(u_re, a_row[p * 2 + 0], x_re, vl);
                u_im = VFNMSACVF_FLOAT(u_im, a_row[p * 2 + 1], x_re, vl);
                u_im = VFNMSACVF_FLOAT(u_im, a_row[p * 2 + 0], x_im, vl);
#else
                u_re = VFNMSACVF_FLOAT(u_re, a_row[p * 2 + 1], x_im, vl);
                u_re = VFNMSACVF_FLOAT(u_re, a_row[p * 2 + 0], x_re, vl);
                u_im = VFMACCVF_FLOAT(u_im, a_row[p * 2 + 1], x_re, vl);
                u_im = VFNMSACVF_FLOAT(u_im, a_row[p * 2 + 0], x_im, vl);
#endif
                upd2 = VSET_VX2(upd2, 0, u_re);
                upd2 = VSET_VX2(upd2, 1, u_im);
                VSSSEG2_FLOAT(c_blk + p * 2, col_stride, upd2, vl);
            }

            c_blk += vl * ldc * 2;
            left  -= vl;
        }
    }
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
|
||||
#ifdef COMPLEX
|
||||
FLOAT dummy2,
|
||||
#endif
|
||||
FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){
|
||||
|
||||
BLASLONG i, j;
|
||||
FLOAT *aa, *cc;
|
||||
BLASLONG kk;
|
||||
|
||||
size_t vl = VSETVL_MAX;
|
||||
|
||||
//fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug
|
||||
|
||||
j = (n >> GEMM_UNROLL_N_SHIFT);
|
||||
|
||||
while (j > 0) {
|
||||
|
||||
kk = m + offset;
|
||||
|
||||
i = m % vl;
|
||||
if (i) {
|
||||
aa = a + (m - i) * k * COMPSIZE;
|
||||
cc = c + (m - i) * COMPSIZE;
|
||||
|
||||
if (k - kk > 0) {
|
||||
GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1,
|
||||
#ifdef COMPLEX
|
||||
ZERO,
|
||||
#endif
|
||||
aa + i * kk * COMPSIZE,
|
||||
b + GEMM_UNROLL_N * kk * COMPSIZE,
|
||||
cc,
|
||||
ldc);
|
||||
}
|
||||
|
||||
solve(i, GEMM_UNROLL_N,
|
||||
aa + (kk - i) * i * COMPSIZE,
|
||||
b + (kk - i) * GEMM_UNROLL_N * COMPSIZE,
|
||||
cc, ldc);
|
||||
|
||||
kk -= i;
|
||||
|
||||
}
|
||||
|
||||
int mod = i;
|
||||
i = vl;
|
||||
if (i <= m) {
|
||||
aa = a + (m - mod - vl) * k * COMPSIZE;
|
||||
cc = c + (m - mod - vl) * COMPSIZE;
|
||||
|
||||
do {
|
||||
if (k - kk > 0) {
|
||||
GEMM_KERNEL(vl, GEMM_UNROLL_N, k - kk, dm1,
|
||||
#ifdef COMPLEX
|
||||
ZERO,
|
||||
#endif
|
||||
aa + vl * kk * COMPSIZE,
|
||||
b + GEMM_UNROLL_N * kk * COMPSIZE,
|
||||
cc,
|
||||
ldc);
|
||||
}
|
||||
|
||||
solve(vl, GEMM_UNROLL_N,
|
||||
aa + (kk - vl) * vl * COMPSIZE,
|
||||
b + (kk - vl) * GEMM_UNROLL_N * COMPSIZE,
|
||||
cc, ldc);
|
||||
|
||||
aa -= vl * k * COMPSIZE;
|
||||
cc -= vl * COMPSIZE;
|
||||
kk -= vl;
|
||||
|
||||
i += vl;
|
||||
} while (i <= m);
|
||||
}
|
||||
|
||||
|
||||
b += GEMM_UNROLL_N * k * COMPSIZE;
|
||||
c += GEMM_UNROLL_N * ldc * COMPSIZE;
|
||||
j --;
|
||||
}
|
||||
|
||||
if (n & (GEMM_UNROLL_N - 1)) {
|
||||
|
||||
j = (GEMM_UNROLL_N >> 1);
|
||||
while (j > 0) {
|
||||
if (n & j) {
|
||||
|
||||
kk = m + offset;
|
||||
|
||||
i = m % vl;
|
||||
if (i) {
|
||||
aa = a + (m - i) * k * COMPSIZE;
|
||||
cc = c + (m - i) * COMPSIZE;
|
||||
|
||||
if (k - kk > 0) {
|
||||
GEMM_KERNEL(i, j, k - kk, dm1,
|
||||
#ifdef COMPLEX
|
||||
ZERO,
|
||||
#endif
|
||||
aa + i * kk * COMPSIZE,
|
||||
b + j * kk * COMPSIZE,
|
||||
cc, ldc);
|
||||
}
|
||||
|
||||
solve(i, j,
|
||||
aa + (kk - i) * i * COMPSIZE,
|
||||
b + (kk - i) * j * COMPSIZE,
|
||||
cc, ldc);
|
||||
|
||||
kk -= i;
|
||||
|
||||
}
|
||||
|
||||
int mod = i;
|
||||
i = vl;
|
||||
if (i <= m) {
|
||||
aa = a + (m - mod - vl) * k * COMPSIZE;
|
||||
cc = c + (m - mod - vl) * COMPSIZE;
|
||||
|
||||
do {
|
||||
if (k - kk > 0) {
|
||||
GEMM_KERNEL(vl, j, k - kk, dm1,
|
||||
#ifdef COMPLEX
|
||||
ZERO,
|
||||
#endif
|
||||
aa + vl * kk * COMPSIZE,
|
||||
b + j * kk * COMPSIZE,
|
||||
cc,
|
||||
ldc);
|
||||
}
|
||||
|
||||
solve(vl, j,
|
||||
aa + (kk - vl) * vl * COMPSIZE,
|
||||
b + (kk - vl) * j * COMPSIZE,
|
||||
cc, ldc);
|
||||
|
||||
aa -= vl * k * COMPSIZE;
|
||||
cc -= vl * COMPSIZE;
|
||||
kk -= vl;
|
||||
|
||||
i += vl;
|
||||
} while (i <= m);
|
||||
}
|
||||
|
||||
b += j * k * COMPSIZE;
|
||||
c += j * ldc * COMPSIZE;
|
||||
}
|
||||
j >>= 1;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
341
kernel/riscv64/trsm_kernel_LT_rvv_v1.c
Normal file
341
kernel/riscv64/trsm_kernel_LT_rvv_v1.c
Normal file
@@ -0,0 +1,341 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) __riscv_vsetvl_e32m2(n)
|
||||
#define VSETVL_MAX __riscv_vsetvlmax_e32m2()
|
||||
#define FLOAT_V_T vfloat32m2_t
|
||||
#define FLOAT_VX2_T vfloat32m2x2_t
|
||||
#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2
|
||||
#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2
|
||||
#define VLSEV_FLOAT __riscv_vlse32_v_f32m2
|
||||
#define VSSEV_FLOAT __riscv_vsse32_v_f32m2
|
||||
#define VSEV_FLOAT __riscv_vse32_v_f32m2
|
||||
#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2
|
||||
#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2
|
||||
#define VSSSEG2_FLOAT __riscv_vssseg2e32_v_f32m2x2
|
||||
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2
|
||||
#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2
|
||||
#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2
|
||||
#else
|
||||
#define VSETVL(n) __riscv_vsetvl_e64m2(n)
|
||||
#define VSETVL_MAX __riscv_vsetvlmax_e64m2()
|
||||
#define FLOAT_V_T vfloat64m2_t
|
||||
#define FLOAT_VX2_T vfloat64m2x2_t
|
||||
#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2
|
||||
#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2
|
||||
#define VLSEV_FLOAT __riscv_vlse64_v_f64m2
|
||||
#define VSSEV_FLOAT __riscv_vsse64_v_f64m2
|
||||
#define VSEV_FLOAT __riscv_vse64_v_f64m2
|
||||
#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2
|
||||
#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2
|
||||
#define VSSSEG2_FLOAT __riscv_vssseg2e64_v_f64m2x2
|
||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2
|
||||
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2
|
||||
#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2
|
||||
#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m2
|
||||
#endif
|
||||
|
||||
|
||||
static FLOAT dm1 = -1.;
|
||||
|
||||
#ifdef CONJ
|
||||
#define GEMM_KERNEL GEMM_KERNEL_L
|
||||
#else
|
||||
#define GEMM_KERNEL GEMM_KERNEL_N
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 1
|
||||
#define GEMM_UNROLL_N_SHIFT 0
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 2
|
||||
#define GEMM_UNROLL_N_SHIFT 1
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 4
|
||||
#define GEMM_UNROLL_N_SHIFT 2
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 8
|
||||
#define GEMM_UNROLL_N_SHIFT 3
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 16
|
||||
#define GEMM_UNROLL_N_SHIFT 4
|
||||
#endif
|
||||
|
||||
// Optimizes the implementation in ../arm64/trsm_kernel_LT_sve.c
|
||||
|
||||
#ifndef COMPLEX
|
||||
|
||||
/*
 * Real-valued tile solve, rows visited from 0 up to m-1.
 *
 * For each row r: column-strided row r of c is multiplied by a[r*m + r],
 * written contiguously to b[r*n ...] and back into c; then every row p > r
 * of c has a[r*m + p] times that result subtracted from it.
 *
 * m, n : tile rows / columns
 * a    : m x m coefficient tile (row r starts at a + r*m)
 * b    : m x n output tile (row r starts at b + r*n)
 * c    : tile of C, element (r, col) at c[r + col*ldc]
 * ldc  : column stride of c, in elements
 *
 * NOTE(review): a[r*m + r] is used as a multiplier, so it is presumably the
 * pre-inverted diagonal entry -- confirm against the packing routine.
 */
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {

    const BLASLONG col_stride = sizeof(FLOAT) * ldc;   // byte stride between columns of c
    int row, left, p;

    for (row = 0; row < m; row++) {

        FLOAT *a_row = a + row * m;
        FLOAT *b_out = b + row * n;
        FLOAT *c_blk = c;
        FLOAT  scale = a_row[row];

        left = n;
        while (left > 0) {
            size_t    vl = VSETVL(left);
            FLOAT_V_T x, t;

            // Scale row `row` of this column chunk and store it to both b and c.
            x = VLSEV_FLOAT(c_blk + row, col_stride, vl);
            x = VFMULVF_FLOAT(x, scale, vl);
            VSEV_FLOAT(b_out, x, vl);
            VSSEV_FLOAT(c_blk + row, col_stride, x, vl);
            b_out += vl;

            // Eliminate the scaled row from every row below it.
            for (p = row + 1; p < m; p++) {
                t = VLSEV_FLOAT(c_blk + p, col_stride, vl);
                t = VFNMSACVF_FLOAT(t, a_row[p], x, vl);
                VSSEV_FLOAT(c_blk + p, col_stride, t, vl);
            }

            c_blk += vl * ldc;
            left  -= vl;
        }
    }
}
|
||||
|
||||
#else
|
||||
|
||||
/*
 * Complex-valued tile solve, rows visited from 0 up to m-1.
 *
 * Storage is interleaved (re, im). For each row r, column-strided row r of c
 * is multiplied by the complex value at a[(r*m + r)*2] (with the CONJ sign
 * pattern when CONJ is defined), written to b and back to c, and then
 * subtracted, times a[(r*m + p)*2], from every row p > r.
 *
 * m, n : tile rows / columns
 * a    : m x m complex coefficient tile
 * b    : m x n complex output tile, filled row by row
 * c    : tile of C, element (r, col) at c[(r + col*ldc)*2]
 * ldc  : column stride of c, in complex elements
 *
 * ldc is deliberately left in complex elements: both the byte stride and
 * the per-chunk pointer advance apply the factor of 2 themselves, so
 * doubling ldc as well would skip columns whenever n exceeds one vector
 * length.
 */
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {

    FLOAT aa1, aa2;
    FLOAT *pc;
    int i, j, k;

    BLASLONG stride_ldc = sizeof(FLOAT) * ldc * 2;   // byte stride between columns of c

    FLOAT_VX2_T vbx2, vsx2, vcx2;
    FLOAT_V_T vb1, vb2, vc1, vc2, vs1, vs2;
    size_t vl;

    for (i = 0; i < m; i++) {
        aa1 = *(a + i * 2 + 0);
        aa2 = *(a + i * 2 + 1);
        pc  = c;

        for (j = n; j > 0; j -= vl) {
            vl = VSETVL(j);

            // Scale row i of this column chunk by (aa1, aa2).
            vbx2 = VLSSEG2_FLOAT(pc + i * 2, stride_ldc, vl);
            vb1 = VGET_VX2(vbx2, 0);
            vb2 = VGET_VX2(vbx2, 1);
#ifndef CONJ
            vs1 = VFMULVF_FLOAT(vb1, aa1, vl);
            vs1 = VFNMSACVF_FLOAT(vs1, aa2, vb2, vl);
            vs2 = VFMULVF_FLOAT(vb2, aa1, vl);
            vs2 = VFMACCVF_FLOAT(vs2, aa2, vb1, vl);
#else
            vs1 = VFMULVF_FLOAT(vb1, aa1, vl);
            vs1 = VFMACCVF_FLOAT(vs1, aa2, vb2, vl);
            vs2 = VFMULVF_FLOAT(vb2, aa1, vl);
            vs2 = VFNMSACVF_FLOAT(vs2, aa2, vb1, vl);
#endif
            vsx2 = VSET_VX2(vsx2, 0, vs1);
            vsx2 = VSET_VX2(vsx2, 1, vs2);
            VSSEG2_FLOAT(b, vsx2, vl);
            VSSSEG2_FLOAT(pc + i * 2, stride_ldc, vsx2, vl);
            b += vl * 2;

            // Eliminate the scaled row from every row below it.
            for (k = i + 1; k < m; k++) {
                vcx2 = VLSSEG2_FLOAT(pc + k * 2, stride_ldc, vl);
                vc1 = VGET_VX2(vcx2, 0);
                vc2 = VGET_VX2(vcx2, 1);
#ifndef CONJ
                vc1 = VFMACCVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl);
                vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl);
                vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl);
                vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl);
#else
                vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl);
                vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl);
                vc2 = VFMACCVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl);
                vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl);
#endif
                vcx2 = VSET_VX2(vcx2, 0, vc1);
                vcx2 = VSET_VX2(vcx2, 1, vc2);
                VSSSEG2_FLOAT(pc + k * 2, stride_ldc, vcx2, vl);
            }

            // Advance vl complex columns: vl * ldc complex elements, 2 FLOATs each.
            pc += vl * ldc * 2;
        }

        a += m * 2;
    }
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
|
||||
#ifdef COMPLEX
|
||||
FLOAT dummy2,
|
||||
#endif
|
||||
FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){
|
||||
|
||||
FLOAT *aa, *cc;
|
||||
BLASLONG kk;
|
||||
BLASLONG i, j;
|
||||
|
||||
size_t vl = VSETVL_MAX;
|
||||
|
||||
//fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug
|
||||
|
||||
j = (n >> GEMM_UNROLL_N_SHIFT);
|
||||
|
||||
while (j > 0) {
|
||||
|
||||
kk = offset;
|
||||
aa = a;
|
||||
cc = c;
|
||||
|
||||
i = vl;
|
||||
|
||||
while (i <= m) {
|
||||
|
||||
if (kk > 0) {
|
||||
GEMM_KERNEL(vl, GEMM_UNROLL_N, kk, dm1,
|
||||
#ifdef COMPLEX
|
||||
ZERO,
|
||||
#endif
|
||||
aa, b, cc, ldc);
|
||||
}
|
||||
|
||||
solve(vl, GEMM_UNROLL_N,
|
||||
aa + kk * vl * COMPSIZE,
|
||||
b + kk * GEMM_UNROLL_N * COMPSIZE,
|
||||
cc, ldc);
|
||||
|
||||
aa += vl * k * COMPSIZE;
|
||||
cc += vl * COMPSIZE;
|
||||
kk += vl;
|
||||
i += vl;
|
||||
}
|
||||
|
||||
i = m % vl;
|
||||
if (i) {
|
||||
if (kk > 0) {
|
||||
GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1,
|
||||
#ifdef COMPLEX
|
||||
ZERO,
|
||||
#endif
|
||||
aa, b, cc, ldc);
|
||||
}
|
||||
solve(i, GEMM_UNROLL_N,
|
||||
aa + kk * i * COMPSIZE,
|
||||
b + kk * GEMM_UNROLL_N * COMPSIZE,
|
||||
cc, ldc);
|
||||
|
||||
aa += i * k * COMPSIZE;
|
||||
cc += i * COMPSIZE;
|
||||
kk += i;
|
||||
|
||||
}
|
||||
|
||||
b += GEMM_UNROLL_N * k * COMPSIZE;
|
||||
c += GEMM_UNROLL_N * ldc * COMPSIZE;
|
||||
j --;
|
||||
}
|
||||
|
||||
if (n & (GEMM_UNROLL_N - 1)) {
|
||||
|
||||
j = (GEMM_UNROLL_N >> 1);
|
||||
while (j > 0) {
|
||||
if (n & j) {
|
||||
|
||||
kk = offset;
|
||||
aa = a;
|
||||
cc = c;
|
||||
|
||||
i = vl;
|
||||
|
||||
while (i <= m) {
|
||||
if (kk > 0) {
|
||||
GEMM_KERNEL(vl, j, kk, dm1,
|
||||
#ifdef COMPLEX
|
||||
ZERO,
|
||||
#endif
|
||||
aa,
|
||||
b,
|
||||
cc,
|
||||
ldc);
|
||||
}
|
||||
|
||||
solve(vl, j,
|
||||
aa + kk * vl * COMPSIZE,
|
||||
b + kk * j * COMPSIZE, cc, ldc);
|
||||
|
||||
aa += vl * k * COMPSIZE;
|
||||
cc += vl * COMPSIZE;
|
||||
kk += vl;
|
||||
i += vl;
|
||||
}
|
||||
|
||||
i = m % vl;
|
||||
if (i) {
|
||||
if (kk > 0) {
|
||||
GEMM_KERNEL(i, j, kk, dm1,
|
||||
#ifdef COMPLEX
|
||||
ZERO,
|
||||
#endif
|
||||
aa,
|
||||
b,
|
||||
cc,
|
||||
ldc);
|
||||
}
|
||||
|
||||
solve(i, j,
|
||||
aa + kk * i * COMPSIZE,
|
||||
b + kk * j * COMPSIZE, cc, ldc);
|
||||
|
||||
aa += i * k * COMPSIZE;
|
||||
cc += i * COMPSIZE;
|
||||
kk += i;
|
||||
|
||||
}
|
||||
|
||||
b += j * k * COMPSIZE;
|
||||
c += j * ldc * COMPSIZE;
|
||||
}
|
||||
j >>= 1;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
337
kernel/riscv64/trsm_kernel_RN_rvv_v1.c
Normal file
337
kernel/riscv64/trsm_kernel_RN_rvv_v1.c
Normal file
@@ -0,0 +1,337 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) __riscv_vsetvl_e32m2(n)
|
||||
#define VSETVL_MAX __riscv_vsetvlmax_e32m2()
|
||||
#define FLOAT_V_T vfloat32m2_t
|
||||
#define FLOAT_VX2_T vfloat32m2x2_t
|
||||
#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2
|
||||
#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2
|
||||
#define VLEV_FLOAT __riscv_vle32_v_f32m2
|
||||
#define VSSEV_FLOAT __riscv_vsse32_v_f32m2
|
||||
#define VSEV_FLOAT __riscv_vse32_v_f32m2
|
||||
#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2
|
||||
#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2
|
||||
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2
|
||||
#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2
|
||||
#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2
|
||||
#else
|
||||
#define VSETVL(n) __riscv_vsetvl_e64m2(n)
|
||||
#define VSETVL_MAX __riscv_vsetvlmax_e64m2()
|
||||
#define FLOAT_V_T vfloat64m2_t
|
||||
#define FLOAT_VX2_T vfloat64m2x2_t
|
||||
#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2
|
||||
#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2
|
||||
#define VLEV_FLOAT __riscv_vle64_v_f64m2
|
||||
#define VSSEV_FLOAT __riscv_vsse64_v_f64m2
|
||||
#define VSEV_FLOAT __riscv_vse64_v_f64m2
|
||||
#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2
|
||||
#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2
|
||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2
|
||||
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2
|
||||
#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2
|
||||
#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m2
|
||||
#endif
|
||||
|
||||
static FLOAT dm1 = -1.;
|
||||
|
||||
#ifdef CONJ
|
||||
#define GEMM_KERNEL GEMM_KERNEL_R
|
||||
#else
|
||||
#define GEMM_KERNEL GEMM_KERNEL_N
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 1
|
||||
#define GEMM_UNROLL_N_SHIFT 0
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 2
|
||||
#define GEMM_UNROLL_N_SHIFT 1
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 4
|
||||
#define GEMM_UNROLL_N_SHIFT 2
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 8
|
||||
#define GEMM_UNROLL_N_SHIFT 3
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 16
|
||||
#define GEMM_UNROLL_N_SHIFT 4
|
||||
#endif
|
||||
|
||||
// Optimizes the implementation in ../arm64/trsm_kernel_RN_sve.c
|
||||
|
||||
#ifndef COMPLEX
|
||||
|
||||
/*
 * Real-valued solve of one m x n tile of C (column stride ldc), columns taken
 * in ascending order.
 *
 * For column `col`:
 *   1. the column is multiplied by b[col], where b points at row `col` of the
 *      packed n x n factor (presumably the pre-inverted diagonal entry --
 *      confirm against the packing routine);
 *   2. the scaled column is written to the packed buffer a and back into C;
 *   3. b[later] times the scaled column is subtracted from every column
 *      later > col.
 *
 * Rows are processed in strips of vl elements as granted by VSETVL.
 * a is advanced by m elements per column, b by n elements per column.
 */
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {

    int col, rows_left, later;
    size_t vl;

    for (col = 0; col < n; col++, b += n) {
        const FLOAT scale = b[col];
        FLOAT *pivot = c + col * ldc;   /* strip of the column being solved */
        FLOAT *strip = c;               /* same rows in column 0 */

        for (rows_left = m; rows_left > 0; rows_left -= vl) {
            FLOAT_V_T solved, target;

            vl = VSETVL(rows_left);

            solved = VLEV_FLOAT(pivot, vl);
            solved = VFMULVF_FLOAT(solved, scale, vl);
            VSEV_FLOAT(a, solved, vl);
            VSEV_FLOAT(pivot, solved, vl);

            /* Eliminate the freshly solved strip from the columns to its right. */
            for (later = col + 1; later < n; later++) {
                FLOAT *dst = strip + later * ldc;

                target = VLEV_FLOAT(dst, vl);
                target = VFNMSACVF_FLOAT(target, b[later], solved, vl);
                VSEV_FLOAT(dst, target, vl);
            }

            a += vl;
            pivot += vl;
            strip += vl;
        }
    }
}
|
||||
|
||||
#else
|
||||
|
||||
/*
 * Complex-valued solve of one m x n tile of C, columns taken in ascending
 * order.  Data is stored as interleaved (real, imaginary) pairs, hence the
 * factor 2 on every offset and the two-field segment loads/stores.
 *
 * For column i:
 *   1. the column is multiplied by the complex scalar (bb1, bb2) read from
 *      b[2*i], b[2*i+1]; with CONJ defined the conjugate of that scalar is
 *      used instead;
 *   2. the product is written to the packed buffer a and back into C;
 *   3. for every later column k > i, (b[2*k], b[2*k+1]) times the product
 *      (conjugated scalar under CONJ) is subtracted from column k.
 *
 * Rows are processed in strips of vl complex elements as granted by VSETVL.
 */
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {

    FLOAT bb1, bb2;
    FLOAT *pci, *pcj;
    int i, j, k;
    FLOAT_VX2_T vax2, vsx2, vcx2;
    FLOAT_V_T va1, va2, vs1, vs2, vc1, vc2;
    size_t vl;

    for (i = 0; i < n; i++) {

        bb1 = *(b + i * 2 + 0);
        bb2 = *(b + i * 2 + 1);

        pci = c + i * ldc * 2;
        pcj = c;

        for (j = m; j > 0; j -= vl) {
            vl = VSETVL(j);

            /* Split vl complex values into real (va1) and imaginary (va2) lanes. */
            vax2 = VLSEG2_FLOAT(pci, vl);
            va1 = VGET_VX2(vax2, 0);
            va2 = VGET_VX2(vax2, 1);
#ifndef CONJ
            /* vs = va * (bb1 + i*bb2) */
            vs1 = VFMULVF_FLOAT(va1, bb1, vl);
            vs1 = VFNMSACVF_FLOAT(vs1, bb2, va2, vl);
            vs2 = VFMULVF_FLOAT(va1, bb2, vl);
            vs2 = VFMACCVF_FLOAT(vs2, bb1, va2, vl);
#else
            /* vs = va * (bb1 - i*bb2) */
            vs1 = VFMULVF_FLOAT(va1, bb1, vl);
            vs1 = VFMACCVF_FLOAT(vs1, bb2, va2, vl);
            vs2 = VFMULVF_FLOAT(va2, bb1, vl);
            vs2 = VFNMSACVF_FLOAT(vs2, bb2, va1, vl);
#endif
            /*
             * Assemble the result tuple on top of the already-loaded vax2
             * rather than on vsx2 itself: vsx2 has no value yet on the first
             * pass, and both of its fields are overwritten here anyway.
             */
            vsx2 = VSET_VX2(vax2, 0, vs1);
            vsx2 = VSET_VX2(vsx2, 1, vs2);
            VSSEG2_FLOAT(a, vsx2, vl);
            VSSEG2_FLOAT(pci, vsx2, vl);
            a += vl * 2;
            pci += vl * 2;

            /* Eliminate the solved strip from every column to its right. */
            for (k = i + 1; k < n; k++) {
                vcx2 = VLSEG2_FLOAT(pcj + k * ldc * 2, vl);
                vc1 = VGET_VX2(vcx2, 0);
                vc2 = VGET_VX2(vcx2, 1);
#ifndef CONJ
                vc1 = VFMACCVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl);
                vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl);
                vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl);
                vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl);
#else
                vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl);
                vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl);
                vc2 = VFMACCVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl);
                vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl);
#endif
                vcx2 = VSET_VX2(vcx2, 0, vc1);
                vcx2 = VSET_VX2(vcx2, 1, vc2);
                VSSEG2_FLOAT(pcj + k * ldc * 2, vcx2, vl);
            }
            pcj += vl * 2;
        }
        b += n * 2;
    }
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/*
 * Process one rows x cols tile: when kk > 0, first run GEMM_KERNEL over the
 * leading kk steps of the packed operands (scalar dm1), then call solve() on
 * the tile with both packed operands advanced past those kk steps.
 */
static inline void trsm_rn_tile(BLASLONG rows, BLASLONG cols, BLASLONG kk,
                                FLOAT *aa, FLOAT *b, FLOAT *cc, BLASLONG ldc) {

    if (kk > 0) {
        GEMM_KERNEL(rows, cols, kk, dm1,
#ifdef COMPLEX
                    ZERO,
#endif
                    aa, b, cc, ldc);
    }

    solve(rows, cols,
          aa + kk * rows * COMPSIZE,
          b + kk * cols * COMPSIZE,
          cc, ldc);
}

/*
 * TRSM kernel driver (RN variant).
 *
 * Walks C in column panels from left to right: first every full panel of
 * GEMM_UNROLL_N columns, then the leftover columns in power-of-two widths
 * (GEMM_UNROLL_N / 2 down to 1) selected by the bits of n.  Within a panel
 * the m rows are handled in strips of vl = VSETVL_MAX rows followed by one
 * shorter strip of m % vl rows.
 *
 * kk starts at -offset and grows by the panel width after each panel.
 * dummy1 (and dummy2 for COMPLEX) are not used.  Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
#ifdef COMPLEX
          FLOAT dummy2,
#endif
          FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){

    const size_t vl = VSETVL_MAX;
    BLASLONG kk = -offset;
    BLASLONG panels, width, row_end, tail_rows;
    FLOAT *aa, *cc;

    /* Full-width panels. */
    for (panels = (n >> GEMM_UNROLL_N_SHIFT); panels > 0; panels--) {

        aa = a;
        cc = c;

        for (row_end = vl; row_end <= m; row_end += vl) {
            trsm_rn_tile(vl, GEMM_UNROLL_N, kk, aa, b, cc, ldc);
            aa += vl * k * COMPSIZE;
            cc += vl * COMPSIZE;
        }

        tail_rows = m % vl;
        if (tail_rows) {
            trsm_rn_tile(tail_rows, GEMM_UNROLL_N, kk, aa, b, cc, ldc);
        }

        kk += GEMM_UNROLL_N;
        b += GEMM_UNROLL_N * k * COMPSIZE;
        c += GEMM_UNROLL_N * ldc * COMPSIZE;
    }

    /* Leftover columns, widest remaining power of two first. */
    if (n & (GEMM_UNROLL_N - 1)) {

        for (width = (GEMM_UNROLL_N >> 1); width > 0; width >>= 1) {

            if (!(n & width)) {
                continue;
            }

            aa = a;
            cc = c;

            for (row_end = vl; row_end <= m; row_end += vl) {
                trsm_rn_tile(vl, width, kk, aa, b, cc, ldc);
                aa += vl * k * COMPSIZE;
                cc += vl * COMPSIZE;
            }

            tail_rows = m % vl;
            if (tail_rows) {
                trsm_rn_tile(tail_rows, width, kk, aa, b, cc, ldc);
            }

            b += width * k * COMPSIZE;
            c += width * ldc * COMPSIZE;
            kk += width;
        }
    }

    return 0;
}
|
||||
356
kernel/riscv64/trsm_kernel_RT_rvv_v1.c
Normal file
356
kernel/riscv64/trsm_kernel_RT_rvv_v1.c
Normal file
@@ -0,0 +1,356 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) __riscv_vsetvl_e32m2(n)
|
||||
#define VSETVL_MAX __riscv_vsetvlmax_e32m2()
|
||||
#define FLOAT_V_T vfloat32m2_t
|
||||
#define FLOAT_VX2_T vfloat32m2x2_t
|
||||
#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2
|
||||
#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2
|
||||
#define VLEV_FLOAT __riscv_vle32_v_f32m2
|
||||
#define VSEV_FLOAT __riscv_vse32_v_f32m2
|
||||
#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2
|
||||
#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2
|
||||
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2
|
||||
#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2
|
||||
#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2
|
||||
#else
|
||||
#define VSETVL(n) __riscv_vsetvl_e64m2(n)
|
||||
#define VSETVL_MAX __riscv_vsetvlmax_e64m2()
|
||||
#define FLOAT_V_T vfloat64m2_t
|
||||
#define FLOAT_VX2_T vfloat64m2x2_t
|
||||
#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2
|
||||
#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2
|
||||
#define VLEV_FLOAT __riscv_vle64_v_f64m2
|
||||
#define VSEV_FLOAT __riscv_vse64_v_f64m2
|
||||
#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2
|
||||
#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2
|
||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2
|
||||
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2
|
||||
#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2
|
||||
#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m2
|
||||
#endif
|
||||
|
||||
|
||||
/* Scalar -1 handed to GEMM_KERNEL by the driver below. */
static FLOAT dm1 = -1.;
|
||||
|
||||
/* Select the GEMM micro-kernel variant: the _R flavour when CONJ is defined, _N otherwise. */
#ifdef CONJ
|
||||
#define GEMM_KERNEL GEMM_KERNEL_R
|
||||
#else
|
||||
#define GEMM_KERNEL GEMM_KERNEL_N
|
||||
#endif
|
||||
|
||||
/*
 * GEMM_UNROLL_N_SHIFT = log2(GEMM_DEFAULT_UNROLL_N); the driver uses it as
 * n >> GEMM_UNROLL_N_SHIFT to count full-width column panels.
 * Only the factors 1, 2, 4, 8 and 16 are mapped here; for any other value
 * this chain leaves the macro undefined.
 */
#if GEMM_DEFAULT_UNROLL_N == 1
|
||||
#define GEMM_UNROLL_N_SHIFT 0
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 2
|
||||
#define GEMM_UNROLL_N_SHIFT 1
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 4
|
||||
#define GEMM_UNROLL_N_SHIFT 2
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 8
|
||||
#define GEMM_UNROLL_N_SHIFT 3
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 16
|
||||
#define GEMM_UNROLL_N_SHIFT 4
|
||||
#endif
|
||||
|
||||
// Optimizes the implementation in ../arm64/trsm_kernel_RT_sve.c
|
||||
|
||||
#ifndef COMPLEX
|
||||
|
||||
/*
 * Real-valued solve of one m x n tile of C (column stride ldc), columns taken
 * in descending order (n-1 down to 0).
 *
 * For column `col`, using row `col` of the packed n x n factor b:
 *   1. the column is multiplied by that row's entry `col` (presumably the
 *      pre-inverted diagonal -- confirm against the packing routine);
 *   2. the scaled column is written to slot `col` (m elements) of the packed
 *      buffer a and back into C;
 *   3. entry `earlier` of the row times the scaled column is subtracted from
 *      every column earlier < col.
 *
 * Rows are processed in strips of vl elements as granted by VSETVL.
 */
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {

    int col, rows_left, earlier;
    size_t vl;

    for (col = n - 1; col >= 0; col--) {
        FLOAT *coef = b + col * n;      /* row `col` of the packed factor */
        FLOAT *packed = a + col * m;    /* output slot for this column */
        FLOAT *pivot = c + col * ldc;   /* strip of the column being solved */
        FLOAT *strip = c;               /* same rows in column 0 */
        const FLOAT scale = coef[col];

        for (rows_left = m; rows_left > 0; rows_left -= vl) {
            FLOAT_V_T solved, target;

            vl = VSETVL(rows_left);

            solved = VLEV_FLOAT(pivot, vl);
            solved = VFMULVF_FLOAT(solved, scale, vl);
            VSEV_FLOAT(packed, solved, vl);
            VSEV_FLOAT(pivot, solved, vl);

            /* Eliminate the freshly solved strip from the columns to its left. */
            for (earlier = 0; earlier < col; earlier++) {
                FLOAT *dst = strip + earlier * ldc;

                target = VLEV_FLOAT(dst, vl);
                target = VFNMSACVF_FLOAT(target, coef[earlier], solved, vl);
                VSEV_FLOAT(dst, target, vl);
            }

            packed += vl;
            pivot += vl;
            strip += vl;
        }
    }
}
|
||||
|
||||
#else
|
||||
|
||||
/*
 * Complex-valued solve of one m x n tile of C, columns taken in descending
 * order (n-1 down to 0).  Data is stored as interleaved (real, imaginary)
 * pairs, hence the factor 2 on every offset and the two-field segment
 * loads/stores.
 *
 * a and b are first moved to their last column/row; after each column b
 * steps back one row (2*n values) and a steps back to the previous column
 * slot (the inner loop advanced it by 2*m values, so 4*m are subtracted).
 *
 * For column i:
 *   1. the column is multiplied by the complex scalar (bb1, bb2) read from
 *      b[2*i], b[2*i+1]; with CONJ defined the conjugate of that scalar is
 *      used instead;
 *   2. the product is written to the packed buffer a and back into C;
 *   3. for every earlier column k < i, (b[2*k], b[2*k+1]) times the product
 *      (conjugated scalar under CONJ) is subtracted from column k.
 */
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {

    FLOAT bb1, bb2;
    FLOAT *pci, *pcj;
    int i, j, k;
    FLOAT_VX2_T vax2, vsx2, vcx2;
    FLOAT_V_T va1, va2, vs1, vs2, vc1, vc2;
    size_t vl;

    a += (n - 1) * m * 2;
    b += (n - 1) * n * 2;

    for (i = n - 1; i >= 0; i--) {

        bb1 = *(b + i * 2 + 0);
        bb2 = *(b + i * 2 + 1);

        pci = c + i * ldc * 2;
        pcj = c;

        for (j = m; j > 0; j -= vl) {
            vl = VSETVL(j);

            /* Split vl complex values into real (va1) and imaginary (va2) lanes. */
            vax2 = VLSEG2_FLOAT(pci, vl);
            va1 = VGET_VX2(vax2, 0);
            va2 = VGET_VX2(vax2, 1);
#ifndef CONJ
            /* vs = va * (bb1 + i*bb2) */
            vs1 = VFMULVF_FLOAT(va1, bb1, vl);
            vs1 = VFNMSACVF_FLOAT(vs1, bb2, va2, vl);
            vs2 = VFMULVF_FLOAT(va1, bb2, vl);
            vs2 = VFMACCVF_FLOAT(vs2, bb1, va2, vl);
#else
            /* vs = va * (bb1 - i*bb2) */
            vs1 = VFMULVF_FLOAT(va1, bb1, vl);
            vs1 = VFMACCVF_FLOAT(vs1, bb2, va2, vl);
            vs2 = VFMULVF_FLOAT(va2, bb1, vl);
            vs2 = VFNMSACVF_FLOAT(vs2, bb2, va1, vl);
#endif
            /*
             * Assemble the result tuple on top of the already-loaded vax2
             * rather than on vsx2 itself: vsx2 has no value yet on the first
             * pass, and both of its fields are overwritten here anyway.
             */
            vsx2 = VSET_VX2(vax2, 0, vs1);
            vsx2 = VSET_VX2(vsx2, 1, vs2);
            VSSEG2_FLOAT(a, vsx2, vl);
            VSSEG2_FLOAT(pci, vsx2, vl);
            a += vl * 2;
            pci += vl * 2;

            /* Eliminate the solved strip from every column to its left. */
            for (k = 0; k < i; k++) {
                vcx2 = VLSEG2_FLOAT(pcj + k * ldc * 2, vl);
                vc1 = VGET_VX2(vcx2, 0);
                vc2 = VGET_VX2(vcx2, 1);
#ifndef CONJ
                vc1 = VFMACCVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl);
                vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl);
                vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl);
                vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl);
#else
                vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl);
                vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl);
                vc2 = VFMACCVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl);
                vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl);
#endif
                vcx2 = VSET_VX2(vcx2, 0, vc1);
                vcx2 = VSET_VX2(vcx2, 1, vc2);
                VSSEG2_FLOAT(pcj + k * ldc * 2, vcx2, vl);
            }
            pcj += vl * 2;
        }
        b -= n * 2;
        a -= 4 * m;
    }
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
 * Process one rows x cols tile: when k - kk > 0, first run GEMM_KERNEL over
 * the trailing k - kk steps of the packed operands (scalar dm1), then call
 * solve() on the tile with both packed operands positioned at step kk - cols.
 */
static inline void trsm_rt_tile(BLASLONG rows, BLASLONG cols, BLASLONG k, BLASLONG kk,
                                FLOAT *aa, FLOAT *b, FLOAT *cc, BLASLONG ldc) {

    if (k - kk > 0) {
        GEMM_KERNEL(rows, cols, k - kk, dm1,
#ifdef COMPLEX
                    ZERO,
#endif
                    aa + rows * kk * COMPSIZE,
                    b + cols * kk * COMPSIZE,
                    cc,
                    ldc);
    }

    solve(rows, cols,
          aa + (kk - cols) * rows * COMPSIZE,
          b + (kk - cols) * cols * COMPSIZE,
          cc, ldc);
}

/*
 * TRSM kernel driver (RT variant).
 *
 * b and c are first moved past their last column panel and then walked
 * backwards: the leftover columns come first, in power-of-two widths from 1
 * up to GEMM_UNROLL_N / 2 as selected by the bits of n, followed by every
 * full panel of GEMM_UNROLL_N columns.  Within a panel the m rows are handled
 * in strips of vl = VSETVL_MAX rows followed by one shorter strip of m % vl
 * rows.
 *
 * kk starts at n - offset and shrinks by the panel width after each panel.
 * dummy1 (and dummy2 for COMPLEX) are not used.  Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
#ifdef COMPLEX
          FLOAT dummy2,
#endif
          FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){

    const size_t vl = VSETVL_MAX;
    BLASLONG kk = n - offset;
    BLASLONG panels, width, row_end, tail_rows;
    FLOAT *aa, *cc;

    c += n * ldc * COMPSIZE;
    b += n * k * COMPSIZE;

    /* Leftover columns, narrowest power of two first. */
    if (n & (GEMM_UNROLL_N - 1)) {

        for (width = 1; width < GEMM_UNROLL_N; width <<= 1) {

            if (!(n & width)) {
                continue;
            }

            b -= width * k * COMPSIZE;
            c -= width * ldc * COMPSIZE;
            aa = a;
            cc = c;

            for (row_end = vl; row_end <= m; row_end += vl) {
                trsm_rt_tile(vl, width, k, kk, aa, b, cc, ldc);
                aa += vl * k * COMPSIZE;
                cc += vl * COMPSIZE;
            }

            tail_rows = m % vl;
            if (tail_rows) {
                trsm_rt_tile(tail_rows, width, k, kk, aa, b, cc, ldc);
            }

            kk -= width;
        }
    }

    /* Full-width panels, right to left. */
    for (panels = (n >> GEMM_UNROLL_N_SHIFT); panels > 0; panels--) {

        b -= GEMM_UNROLL_N * k * COMPSIZE;
        c -= GEMM_UNROLL_N * ldc * COMPSIZE;
        aa = a;
        cc = c;

        for (row_end = vl; row_end <= m; row_end += vl) {
            trsm_rt_tile(vl, GEMM_UNROLL_N, k, kk, aa, b, cc, ldc);
            aa += vl * k * COMPSIZE;
            cc += vl * COMPSIZE;
        }

        tail_rows = m % vl;
        if (tail_rows) {
            trsm_rt_tile(tail_rows, GEMM_UNROLL_N, k, kk, aa, b, cc, ldc);
        }

        kk -= GEMM_UNROLL_N;
    }

    return 0;
}
|
||||
|
||||
|
||||
122
kernel/riscv64/trsm_lncopy_rvv_v1.c
Normal file
122
kernel/riscv64/trsm_lncopy_rvv_v1.c
Normal file
@@ -0,0 +1,122 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) __riscv_vsetvl_e32m2(n)
|
||||
#define FLOAT_V_T vfloat32m2_t
|
||||
#define VLEV_FLOAT __riscv_vle32_v_f32m2
|
||||
#define VSEV_FLOAT __riscv_vse32_v_f32m2
|
||||
#define VSEV_FLOAT_M __riscv_vse32_v_f32m2_m
|
||||
#define VLSEV_FLOAT __riscv_vlse32_v_f32m2
|
||||
#define VBOOL_T vbool16_t
|
||||
#define UINT_V_T vuint32m2_t
|
||||
#define VID_V_UINT __riscv_vid_v_u32m2
|
||||
#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16
|
||||
#else
|
||||
#define VSETVL(n) __riscv_vsetvl_e64m2(n)
|
||||
#define FLOAT_V_T vfloat64m2_t
|
||||
#define VLEV_FLOAT __riscv_vle64_v_f64m2
|
||||
#define VSEV_FLOAT __riscv_vse64_v_f64m2
|
||||
#define VSEV_FLOAT_M __riscv_vse64_v_f64m2_m
|
||||
#define VLSEV_FLOAT __riscv_vlse64_v_f64m2
|
||||
#define VBOOL_T vbool32_t
|
||||
#define UINT_V_T vuint64m2_t
|
||||
#define VID_V_UINT __riscv_vid_v_u64m2
|
||||
#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32
|
||||
|
||||
#endif
|
||||
|
||||
/*
 * INV(a): value written for a diagonal element by the packing routine below.
 * Normally ONE / a; when UNIT is defined it is plain ONE and the argument is
 * not evaluated.
 */
#ifndef UNIT
|
||||
#define INV(a) (ONE / (a))
|
||||
#else
|
||||
#define INV(a) (ONE)
|
||||
#endif
|
||||
|
||||
// Optimizes the implementation in ../arm64/trsm_lncopy_sve.c
|
||||
|
||||
/*
 * Pack routine (trsm_lncopy): copies a into b in groups of vl lanes, where
 * lane c of row r is read from a[r + c * lda] (strided load, byte stride
 * sizeof(FLOAT) * lda).  Every source row consumes vl slots of b.
 *
 * `diag` (initially offset) is the row at which the current lane group meets
 * the diagonal:
 *   - row <  diag : nothing is written, b is only advanced;
 *   - row == diag : a vl x vl block is emitted; in its d-th row the lanes
 *                   below index d are copied, lane d receives INV() of the
 *                   diagonal element, lanes above d are left unwritten;
 *   - row >  diag : all vl lanes are copied.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){

    BLASLONG lane_stride = sizeof(FLOAT) * lda;
    BLASLONG cols_left, row;
    BLASLONG diag = offset;
    FLOAT *src;
    FLOAT_V_T line;
    VBOOL_T before_diag;
    UINT_V_T lane_id;
    size_t vl;

    for (cols_left = n; cols_left > 0; cols_left -= vl) {
        vl = VSETVL(cols_left);
        src = a;
        row = 0;

        while (row < m) {
            if (row == diag) {
                /* Diagonal block: vl consecutive rows handled in one go. */
                lane_id = VID_V_UINT(vl);
                for (unsigned int d = 0; d < vl; d++) {
                    line = VLSEV_FLOAT(src, lane_stride, vl);
                    before_diag = VMSLTU_VX_UINT(lane_id, d, vl);
                    VSEV_FLOAT_M(before_diag, b, line, vl);

                    b[d] = INV(src[d * lda]);
                    src++;
                    b += vl;
                }
                row += vl;
                continue;
            }

            if (row > diag) {
                line = VLSEV_FLOAT(src, lane_stride, vl);
                VSEV_FLOAT(b, line, vl);
            }
            src++;
            b += vl;
            row++;
        }

        a += vl * lda;
        diag += vl;
    }

    return 0;
}
|
||||
122
kernel/riscv64/trsm_ltcopy_rvv_v1.c
Normal file
122
kernel/riscv64/trsm_ltcopy_rvv_v1.c
Normal file
@@ -0,0 +1,122 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) __riscv_vsetvl_e32m2(n)
|
||||
#define FLOAT_V_T vfloat32m2_t
|
||||
#define VLEV_FLOAT __riscv_vle32_v_f32m2
|
||||
#define VSEV_FLOAT __riscv_vse32_v_f32m2
|
||||
#define VSEV_FLOAT_M __riscv_vse32_v_f32m2_m
|
||||
#define VLSEV_FLOAT __riscv_vlse32_v_f32m2
|
||||
#define VBOOL_T vbool16_t
|
||||
#define UINT_V_T vuint32m2_t
|
||||
#define VID_V_UINT __riscv_vid_v_u32m2
|
||||
#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16
|
||||
#else
|
||||
#define VSETVL(n) __riscv_vsetvl_e64m2(n)
|
||||
#define FLOAT_V_T vfloat64m2_t
|
||||
#define VLEV_FLOAT __riscv_vle64_v_f64m2
|
||||
#define VSEV_FLOAT __riscv_vse64_v_f64m2
|
||||
#define VSEV_FLOAT_M __riscv_vse64_v_f64m2_m
|
||||
#define VLSEV_FLOAT __riscv_vlse64_v_f64m2
|
||||
#define VBOOL_T vbool32_t
|
||||
#define UINT_V_T vuint64m2_t
|
||||
#define VID_V_UINT __riscv_vid_v_u64m2
|
||||
#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32
|
||||
#endif
|
||||
|
||||
/*
 * INV(a): value written for a diagonal element by the packing routine below.
 * Normally ONE / a; when UNIT is defined it is plain ONE and the argument is
 * not evaluated.
 */
#ifndef UNIT
|
||||
#define INV(a) (ONE / (a))
|
||||
#else
|
||||
#define INV(a) (ONE)
|
||||
#endif
|
||||
|
||||
// Optimizes the implementation in ../arm64/trsm_ltcopy_sve.c
|
||||
|
||||
/*
 * Pack routine (trsm_ltcopy): copies a into b in groups of vl lanes, where
 * the vl lanes of a row are contiguous in a (unit-stride load) and
 * consecutive rows are lda elements apart.  Every source row consumes vl
 * slots of b.
 *
 * `diag` (initially offset) is the row at which the current lane group meets
 * the diagonal:
 *   - row <  diag : all vl lanes are copied;
 *   - row == diag : a vl x vl block is emitted; in its d-th row lane d
 *                   receives INV() of the diagonal element, the lanes above
 *                   index d are copied, lanes below d are left unwritten;
 *   - row >  diag : nothing is written, b is only advanced.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){

    BLASLONG cols_left, row;
    BLASLONG diag = offset;
    FLOAT *src;
    FLOAT_V_T line;
    VBOOL_T after_diag;
    UINT_V_T lane_id;
    size_t vl;

    for (cols_left = n; cols_left > 0; cols_left -= vl) {
        vl = VSETVL(cols_left);
        src = a;
        row = 0;

        while (row < m) {
            if (row == diag) {
                /* Diagonal block: vl consecutive rows handled in one go. */
                lane_id = VID_V_UINT(vl);
                for (unsigned int d = 0; d < vl; d++) {
                    b[d] = INV(src[d]);

                    line = VLEV_FLOAT(src, vl);
                    after_diag = VMSGTU_VX_UINT(lane_id, d, vl);
                    VSEV_FLOAT_M(after_diag, b, line, vl);

                    b += vl;
                    src += lda;
                }
                row += vl;
                continue;
            }

            if (row < diag) {
                line = VLEV_FLOAT(src, vl);
                VSEV_FLOAT(b, line, vl);
            }
            src += lda;
            b += vl;
            row++;
        }

        a += vl;
        diag += vl;
    }

    return 0;
}
|
||||
|
||||
121
kernel/riscv64/trsm_uncopy_rvv_v1.c
Normal file
121
kernel/riscv64/trsm_uncopy_rvv_v1.c
Normal file
@@ -0,0 +1,121 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) __riscv_vsetvl_e32m2(n)
|
||||
#define FLOAT_V_T vfloat32m2_t
|
||||
#define VLEV_FLOAT __riscv_vle32_v_f32m2
|
||||
#define VSEV_FLOAT __riscv_vse32_v_f32m2
|
||||
#define VSEV_FLOAT_M __riscv_vse32_v_f32m2_m
|
||||
#define VLSEV_FLOAT __riscv_vlse32_v_f32m2
|
||||
#define VBOOL_T vbool16_t
|
||||
#define UINT_V_T vuint32m2_t
|
||||
#define VID_V_UINT __riscv_vid_v_u32m2
|
||||
#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16
|
||||
#else
|
||||
#define VSETVL(n) __riscv_vsetvl_e64m2(n)
|
||||
#define FLOAT_V_T vfloat64m2_t
|
||||
#define VLEV_FLOAT __riscv_vle64_v_f64m2
|
||||
#define VSEV_FLOAT __riscv_vse64_v_f64m2
|
||||
#define VSEV_FLOAT_M __riscv_vse64_v_f64m2_m
|
||||
#define VLSEV_FLOAT __riscv_vlse64_v_f64m2
|
||||
#define VBOOL_T vbool32_t
|
||||
#define UINT_V_T vuint64m2_t
|
||||
#define VID_V_UINT __riscv_vid_v_u64m2
|
||||
#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32
|
||||
#endif
|
||||
|
||||
|
||||
/*
 * INV(a): value written for a diagonal element by the packing routine below.
 * Normally ONE / a; when UNIT is defined it is plain ONE and the argument is
 * not evaluated.
 */
#ifndef UNIT
|
||||
#define INV(a) (ONE / (a))
|
||||
#else
|
||||
#define INV(a) (ONE)
|
||||
#endif
|
||||
|
||||
// Optimizes the implementation in ../arm64/trsm_uncopy_sve.c
|
||||
|
||||
/*
 * Pack routine (trsm_uncopy): copies a into b in groups of vl lanes, where
 * lane c of row r is read from a[r + c * lda] (strided load, byte stride
 * sizeof(FLOAT) * lda).  Every source row consumes vl slots of b.
 *
 * `diag` (initially offset) is the row at which the current lane group meets
 * the diagonal:
 *   - row <  diag : all vl lanes are copied;
 *   - row == diag : a vl x vl block is emitted; in its d-th row lane d
 *                   receives INV() of the diagonal element, the lanes above
 *                   index d are copied, lanes below d are left unwritten;
 *   - row >  diag : nothing is written, b is only advanced.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){

    BLASLONG lane_stride = sizeof(FLOAT) * lda;
    BLASLONG cols_left, row;
    BLASLONG diag = offset;
    FLOAT *src;
    FLOAT_V_T line;
    VBOOL_T after_diag;
    UINT_V_T lane_id;
    size_t vl;

    for (cols_left = n; cols_left > 0; cols_left -= vl) {
        vl = VSETVL(cols_left);
        src = a;
        row = 0;

        while (row < m) {
            if (row == diag) {
                /* Diagonal block: vl consecutive rows handled in one go. */
                lane_id = VID_V_UINT(vl);
                for (unsigned int d = 0; d < vl; d++) {
                    b[d] = INV(src[d * lda]);

                    line = VLSEV_FLOAT(src, lane_stride, vl);
                    after_diag = VMSGTU_VX_UINT(lane_id, d, vl);
                    VSEV_FLOAT_M(after_diag, b, line, vl);

                    src++;
                    b += vl;
                }
                row += vl;
                continue;
            }

            if (row < diag) {
                line = VLSEV_FLOAT(src, lane_stride, vl);
                VSEV_FLOAT(b, line, vl);
            }
            src++;
            b += vl;
            row++;
        }

        a += vl * lda;
        diag += vl;
    }

    return 0;
}
|
||||
123
kernel/riscv64/trsm_utcopy_rvv_v1.c
Normal file
123
kernel/riscv64/trsm_utcopy_rvv_v1.c
Normal file
@@ -0,0 +1,123 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
/* Generic vector names used by the kernel below, bound to RVV intrinsics at
 * LMUL=2: the 32-bit element variants when DOUBLE is not defined, the 64-bit
 * element variants otherwise. VBOOL_T is the mask type matching that
 * element-width/LMUL combination (b16 for e32m2, b32 for e64m2). */
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) __riscv_vsetvl_e32m2(n)
|
||||
#define FLOAT_V_T vfloat32m2_t
|
||||
#define VLEV_FLOAT __riscv_vle32_v_f32m2
|
||||
#define VSEV_FLOAT __riscv_vse32_v_f32m2
|
||||
#define VSEV_FLOAT_M __riscv_vse32_v_f32m2_m
|
||||
#define VLSEV_FLOAT __riscv_vlse32_v_f32m2
|
||||
#define VBOOL_T vbool16_t
|
||||
#define UINT_V_T vuint32m2_t
|
||||
#define VID_V_UINT __riscv_vid_v_u32m2
|
||||
#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16
|
||||
#else
|
||||
#define VSETVL(n) __riscv_vsetvl_e64m2(n)
|
||||
#define FLOAT_V_T vfloat64m2_t
|
||||
#define VLEV_FLOAT __riscv_vle64_v_f64m2
|
||||
#define VSEV_FLOAT __riscv_vse64_v_f64m2
|
||||
#define VSEV_FLOAT_M __riscv_vse64_v_f64m2_m
|
||||
#define VLSEV_FLOAT __riscv_vlse64_v_f64m2
|
||||
#define VBOOL_T vbool32_t
|
||||
#define UINT_V_T vuint64m2_t
|
||||
#define VID_V_UINT __riscv_vid_v_u64m2
|
||||
#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32
|
||||
#endif
|
||||
|
||||
|
||||
/* INV(a) is the reciprocal ONE / (a) unless UNIT is defined; with UNIT it
 * expands to the constant ONE and its argument is never evaluated. */
#ifndef UNIT
|
||||
#define INV(a) (ONE / (a))
|
||||
#else
|
||||
#define INV(a) (ONE)
|
||||
#endif
|
||||
|
||||
// Optimizes the implementation in ../arm64/trsm_utcopy_sve.c
|
||||
|
||||
|
||||
/*
 * Triangular packing copy (the file notes it is modelled on
 * ../arm64/trsm_utcopy_sve.c).
 *
 * The n columns of a are processed in panels of vl contiguous elements, where
 * vl is whatever VSETVL grants for the columns still outstanding. Within a
 * panel, successive lines of a are lda elements apart and each line occupies
 * vl consecutive slots of b, so b advances by vl per line whether or not
 * anything is stored there.
 *
 *   m, n    number of lines walked per panel / total elements per line
 *   a, lda  source matrix and the distance between successive lines
 *   offset  line index at which the first panel meets the diagonal
 *   b       packed destination buffer
 *
 * Lines before the diagonal line are skipped (their slots in b are left
 * untouched), lines after it are copied in full, and the vl lines starting at
 * the diagonal form a triangular block handled specially below.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
    BLASLONG diag = offset;     /* line where the current panel meets the diagonal */
    BLASLONG cols_left = n;
    size_t vl;

    while (cols_left > 0)
    {
        vl = VSETVL(cols_left);

        FLOAT *src = a;
        BLASLONG row = 0;

        while (row < m)
        {
            if (row != diag)
            {
                /* Off-diagonal line: copy it only when it lies past the diagonal. */
                if (row > diag)
                {
                    VSEV_FLOAT(b, VLEV_FLOAT(src, vl), vl);
                }
                src += lda;
                b += vl;
                row++;
                continue;
            }

            /* Diagonal block: vl lines are consumed in one go. For line k the
             * mask selects lanes whose index is below k, so only those lanes
             * are stored; lane k then receives INV() of the diagonal entry and
             * lanes above k are never written. */
            UINT_V_T lane_ids = VID_V_UINT(vl);
            for (unsigned int k = 0; k < vl; k++, src += lda, b += vl)
            {
                FLOAT_V_T line = VLEV_FLOAT(src, vl);
                VBOOL_T before_diag = VMSLTU_VX_UINT(lane_ids, k, vl);
                VSEV_FLOAT_M(before_diag, b, line, vl);
                b[k] = INV(src[k]);
            }
            row += vl;
        }

        /* Move on to the next panel of columns. */
        a += vl;
        diag += vl;
        cols_left -= vl;
    }

    return 0;
}
|
||||
124
kernel/riscv64/zamax_rvv.c
Normal file
124
kernel/riscv64/zamax_rvv.c
Normal file
@@ -0,0 +1,124 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <float.h>
|
||||
|
||||
/* Generic vector names for the kernel below, bound to RVV intrinsics at
 * LMUL=4 (plus LMUL=1 for the reduction result): 32-bit element variants when
 * DOUBLE is not defined, 64-bit element variants otherwise. FLOAT_VX2_T is
 * the two-register tuple returned by the two-field segment loads, and
 * VGET_VX2 extracts one field from it. */
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) __riscv_vsetvl_e32m4(n)
|
||||
#define VSETVL_MAX __riscv_vsetvlmax_e32m4()
|
||||
#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1()
|
||||
#define FLOAT_V_T vfloat32m4_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define FLOAT_VX2_T vfloat32m4x2_t
|
||||
#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4
|
||||
#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2
|
||||
#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2
|
||||
#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m4_f32m1
|
||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
|
||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
|
||||
#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m4_tu
|
||||
#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4
|
||||
#define VFABSV_FLOAT __riscv_vfabs_v_f32m4
|
||||
#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32
|
||||
#else
|
||||
#define VSETVL(n) __riscv_vsetvl_e64m4(n)
|
||||
#define VSETVL_MAX __riscv_vsetvlmax_e64m4()
|
||||
#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1()
|
||||
#define FLOAT_V_T vfloat64m4_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define FLOAT_VX2_T vfloat64m4x2_t
|
||||
#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4
|
||||
#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2
|
||||
#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2
|
||||
#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m4_f64m1
|
||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
|
||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
|
||||
#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m4_tu
|
||||
#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4
|
||||
#define VFABSV_FLOAT __riscv_vfabs_v_f64m4
|
||||
#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64
|
||||
#endif
|
||||
|
||||
/*
 * Returns the largest value of |re| + |im| over n complex elements of x,
 * where consecutive elements are inc_x complex slots (2 * inc_x FLOATs) apart.
 *
 * Returns 0.0 without touching x when n <= 0 or inc_x <= 0.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    if (n <= 0 || inc_x <= 0) {
        return 0.0;
    }

    /* Seed both the scalar reduction operand and the per-lane running maximum
     * with zero; every candidate is a sum of absolute values. */
    FLOAT_V_T_M1 reduced = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1);
    const size_t vlmax = VSETVL_MAX;
    FLOAT_V_T lane_max = VFMVVF_FLOAT(0.0, vlmax);

    if (inc_x == 1) {

        /* Contiguous input: a two-field segment load splits the interleaved
         * real and imaginary parts into separate vectors. */
        while (n > 0) {
            const size_t vl = VSETVL(n);
            FLOAT_VX2_T pair = VLSEG_FLOAT(x, vl);

            FLOAT_V_T abs_re = VFABSV_FLOAT(VGET_VX2(pair, 0), vl);
            FLOAT_V_T abs_im = VFABSV_FLOAT(VGET_VX2(pair, 1), vl);
            FLOAT_V_T magnitude = VFADDVV_FLOAT(abs_re, abs_im, vl);

            /* The _tu form leaves lanes at or beyond vl as they were, so a
             * short final pass does not discard maxima from earlier passes. */
            lane_max = VFMAXVV_FLOAT_TU(lane_max, lane_max, magnitude, vl);

            n -= vl;
            x += vl*2;
        }

    } else {

        /* Strided input: the byte distance between consecutive complex elements. */
        BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;

        while (n > 0) {
            const size_t vl = VSETVL(n);
            FLOAT_VX2_T pair = VLSSEG_FLOAT(x, stride_x, vl);

            FLOAT_V_T abs_re = VFABSV_FLOAT(VGET_VX2(pair, 0), vl);
            FLOAT_V_T abs_im = VFABSV_FLOAT(VGET_VX2(pair, 1), vl);
            FLOAT_V_T magnitude = VFADDVV_FLOAT(abs_re, abs_im, vl);

            lane_max = VFMAXVV_FLOAT_TU(lane_max, lane_max, magnitude, vl);

            n -= vl;
            x += vl*inc_x*2;
        }

    }

    /* Collapse all vlmax lanes into a single scalar. */
    reduced = VFREDMAXVS_FLOAT(lane_max, reduced, vlmax);
    return VFMVFS_FLOAT_M1(reduced);
}
|
||||
@@ -28,40 +28,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m1()
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
|
||||
#define VLSEV_FLOAT vlse32_v_f32m8
|
||||
#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
|
||||
#define MASK_T vbool4_t
|
||||
#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f32m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
|
||||
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
|
||||
#define VFMAXVV_FLOAT vfmax_vv_f32m8
|
||||
#define VFADDVV_FLOAT vfadd_vv_f32m8
|
||||
|
||||
#ifdef RISCV64_ZVL256B
|
||||
# define LMUL m2
|
||||
# if defined(DOUBLE)
|
||||
# define ELEN 64
|
||||
# define MLEN 32
|
||||
# else
|
||||
# define ELEN 32
|
||||
# define MLEN 16
|
||||
# endif
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e64m1()
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
|
||||
#define VLSEV_FLOAT vlse64_v_f64m8
|
||||
#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
|
||||
#define MASK_T vbool8_t
|
||||
#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f64m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
|
||||
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
|
||||
#define VFMAXVV_FLOAT vfmax_vv_f64m8
|
||||
#define VFADDVV_FLOAT vfadd_vv_f64m8
|
||||
|
||||
# define LMUL m8
|
||||
# if defined(DOUBLE)
|
||||
# define ELEN 64
|
||||
# define MLEN 8
|
||||
# else
|
||||
# define ELEN 32
|
||||
# define MLEN 4
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#define _
|
||||
#define JOIN2_X(x, y) x ## y
|
||||
#define JOIN2(x, y) JOIN2_X(x, y)
|
||||
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
|
||||
|
||||
#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _)
|
||||
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
|
||||
#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _)
|
||||
#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL)
|
||||
#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL)
|
||||
#ifdef RISCV_0p10_INTRINSICS
|
||||
#define VFREDMAXVS_FLOAT(va,vb,gvl) JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) (v_res, va, vb, gvl)
|
||||
#else
|
||||
#define VFREDMAXVS_FLOAT JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))
|
||||
#endif
|
||||
#define MASK_T JOIN(vbool, MLEN, _t, _, _)
|
||||
#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _)
|
||||
#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _)
|
||||
#define VFMAXVV_FLOAT JOIN(RISCV_RVV(vfmax), _vv_f, ELEN, LMUL, _)
|
||||
#define VFADDVV_FLOAT JOIN(RISCV_RVV(vfadd), _vv_f, ELEN, LMUL, _)
|
||||
#define VFABSV_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _)
|
||||
|
||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
BLASLONG i=0, j=0;
|
||||
@@ -70,10 +78,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
if (n <= 0 || inc_x <= 0) return(maxf);
|
||||
unsigned int gvl = 0;
|
||||
FLOAT_V_T v0, v1, v_max;
|
||||
FLOAT_V_T_M1 v_res, v_z0;
|
||||
gvl = VSETVL_MAX;
|
||||
v_res = VFMVVF_FLOAT_M1(0, gvl);
|
||||
v_z0 = VFMVVF_FLOAT_M1(0, gvl);
|
||||
FLOAT_V_T_M1 v_res;
|
||||
v_res = VFMVVF_FLOAT_M1(0, 1);
|
||||
|
||||
MASK_T mask0, mask1;
|
||||
BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
|
||||
@@ -83,10 +89,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
for(; i<n/gvl; i++){
|
||||
v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
|
||||
v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
|
||||
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
|
||||
v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
|
||||
mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
|
||||
v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
|
||||
|
||||
v0 = VFABSV_FLOAT(v0, gvl);
|
||||
v1 = VFABSV_FLOAT(v1, gvl);
|
||||
|
||||
v0 = VFADDVV_FLOAT(v0, v1, gvl);
|
||||
v_max = VFMAXVV_FLOAT(v_max, v0, gvl);
|
||||
@@ -94,22 +99,17 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
j += gvl;
|
||||
ix += inc_xv;
|
||||
}
|
||||
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl);
|
||||
maxf = VFMVFS_FLOAT(v_res);
|
||||
v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
|
||||
|
||||
if(j<n){
|
||||
gvl = VSETVL(n-j);
|
||||
v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
|
||||
v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
|
||||
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
|
||||
v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
|
||||
mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
|
||||
v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
|
||||
v0 = VFABSV_FLOAT(v0, gvl);
|
||||
v1 = VFABSV_FLOAT(v1, gvl);
|
||||
v1 = VFADDVV_FLOAT(v0, v1, gvl);
|
||||
v_res = VFREDMAXVS_FLOAT(v_res, v1, v_z0, gvl);
|
||||
|
||||
if(VFMVFS_FLOAT(v_res)> maxf)
|
||||
maxf = VFMVFS_FLOAT(v_res);
|
||||
v_res = VFREDMAXVS_FLOAT(v1, v_res, gvl);
|
||||
}
|
||||
maxf = EXTRACT_FLOAT(v_res);
|
||||
return(maxf);
|
||||
}
|
||||
|
||||
123
kernel/riscv64/zamin_rvv.c
Normal file
123
kernel/riscv64/zamin_rvv.c
Normal file
@@ -0,0 +1,123 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <float.h>
|
||||
|
||||
/* Generic vector names for the kernel below, bound to RVV intrinsics at
 * LMUL=4 (plus LMUL=1 for the reduction result): 32-bit element variants when
 * DOUBLE is not defined, 64-bit element variants otherwise. FLOAT_VX2_T is
 * the two-register tuple returned by the two-field segment loads, and
 * VGET_VX2 extracts one field from it. */
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) __riscv_vsetvl_e32m4(n)
|
||||
#define VSETVL_MAX __riscv_vsetvlmax_e32m4()
|
||||
#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1()
|
||||
#define FLOAT_V_T vfloat32m4_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define FLOAT_VX2_T vfloat32m4x2_t
|
||||
#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4
|
||||
#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2
|
||||
#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2
|
||||
#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m4_f32m1
|
||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
|
||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
|
||||
#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m4_tu
|
||||
#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4
|
||||
#define VFABSV_FLOAT __riscv_vfabs_v_f32m4
|
||||
#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32
|
||||
#else
|
||||
#define VSETVL(n) __riscv_vsetvl_e64m4(n)
|
||||
#define VSETVL_MAX __riscv_vsetvlmax_e64m4()
|
||||
#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1()
|
||||
#define FLOAT_V_T vfloat64m4_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define FLOAT_VX2_T vfloat64m4x2_t
|
||||
#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4
|
||||
#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2
|
||||
#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2
|
||||
#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m4_f64m1
|
||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
|
||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
|
||||
#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m4_tu
|
||||
#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4
|
||||
#define VFABSV_FLOAT __riscv_vfabs_v_f64m4
|
||||
#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64
|
||||
#endif
|
||||
|
||||
/*
 * Returns the smallest value of |re| + |im| over n complex elements of x,
 * where consecutive elements are inc_x complex slots (2 * inc_x FLOATs) apart.
 *
 * Returns 0.0 without touching x when n <= 0 or inc_x <= 0.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    FLOAT minf=0.0;

    if (n <= 0 || inc_x <= 0) return(minf);

    /* The running minimum must start at the largest finite value of the
     * element type actually in use. Seeding a double-precision build with
     * FLT_MAX would cap the result at FLT_MAX whenever every |re| + |im|
     * in the input is larger than that. */
#if defined(DOUBLE)
    const FLOAT seed = DBL_MAX;
#else
    const FLOAT seed = FLT_MAX;
#endif

    FLOAT_V_T v0, v1, vmin;
    FLOAT_V_T_M1 v_res;
    FLOAT_VX2_T vx2;

    v_res = VFMVVF_FLOAT_M1(seed, VSETVL_MAX_M1);
    size_t vlmax = VSETVL_MAX;
    vmin = VFMVVF_FLOAT(seed, vlmax);

    if(inc_x == 1) {

        /* Contiguous input: a two-field segment load splits the interleaved
         * real and imaginary parts into separate vectors. */
        for (size_t vl; n > 0; n -= vl, x += vl*2) {
            vl = VSETVL(n);

            vx2 = VLSEG_FLOAT(x, vl);

            v0 = VGET_VX2(vx2, 0);
            v1 = VGET_VX2(vx2, 1);

            v0 = VFABSV_FLOAT(v0, vl);
            v1 = VFABSV_FLOAT(v1, vl);

            v0 = VFADDVV_FLOAT(v0, v1, vl);
            /* The _tu form leaves lanes at or beyond vl as they were, so a
             * short final pass keeps the minima found by earlier passes. */
            vmin = VFMINVV_FLOAT_TU(vmin, vmin, v0, vl);
        }

    } else {

        /* Strided input: the byte distance between consecutive complex elements. */
        BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;

        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
            vl = VSETVL(n);

            vx2 = VLSSEG_FLOAT(x, stride_x, vl);

            v0 = VGET_VX2(vx2, 0);
            v1 = VGET_VX2(vx2, 1);

            v0 = VFABSV_FLOAT(v0, vl);
            v1 = VFABSV_FLOAT(v1, vl);

            v0 = VFADDVV_FLOAT(v0, v1, vl);
            vmin = VFMINVV_FLOAT_TU(vmin, vmin, v0, vl);
        }

    }

    /* Collapse all vlmax lanes into a single scalar. */
    v_res = VFREDMINVS_FLOAT(vmin, v_res, vlmax);
    minf = VFMVFS_FLOAT_M1(v_res);

    return(minf);
}
|
||||
@@ -29,38 +29,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#include <math.h>
|
||||
#include <float.h>
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m1()
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
|
||||
#define VLSEV_FLOAT vlse32_v_f32m8
|
||||
#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
|
||||
#define MASK_T vbool4_t
|
||||
#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f32m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
|
||||
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
|
||||
#define VFMINVV_FLOAT vfmin_vv_f32m8
|
||||
#define VFADDVV_FLOAT vfadd_vv_f32m8
|
||||
|
||||
#ifdef RISCV64_ZVL256B
|
||||
# define LMUL m2
|
||||
# if defined(DOUBLE)
|
||||
# define ELEN 64
|
||||
# define MLEN 32
|
||||
# else
|
||||
# define ELEN 32
|
||||
# define MLEN 16
|
||||
# endif
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m1()
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
|
||||
#define VLSEV_FLOAT vlse64_v_f64m8
|
||||
#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
|
||||
#define MASK_T vbool8_t
|
||||
#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f64m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
|
||||
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
|
||||
#define VFMINVV_FLOAT vfmin_vv_f64m8
|
||||
#define VFADDVV_FLOAT vfadd_vv_f64m8
|
||||
# define LMUL m8
|
||||
# if defined(DOUBLE)
|
||||
# define ELEN 64
|
||||
# define MLEN 8
|
||||
# else
|
||||
# define ELEN 32
|
||||
# define MLEN 4
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#define _
|
||||
#define JOIN2_X(x, y) x ## y
|
||||
#define JOIN2(x, y) JOIN2_X(x, y)
|
||||
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
|
||||
|
||||
#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _)
|
||||
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
|
||||
#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _)
|
||||
#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL)
|
||||
#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL)
|
||||
#ifdef RISCV_0p10_INTRINSICS
|
||||
#define VFREDMINVS_FLOAT(va,vb,gvl) JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) (v_res, va, vb, gvl)
|
||||
#else
|
||||
#define VFREDMINVS_FLOAT JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))
|
||||
#endif
|
||||
#define MASK_T JOIN(vbool, MLEN, _t, _, _)
|
||||
#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _)
|
||||
#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _)
|
||||
#define VFMINVV_FLOAT JOIN(RISCV_RVV(vfmin), _vv_f, ELEN, LMUL, _)
|
||||
#define VFADDVV_FLOAT JOIN(RISCV_RVV(vfadd), _vv_f, ELEN, LMUL, _)
|
||||
#define VFABSV_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _)
|
||||
|
||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
BLASLONG i=0, j=0;
|
||||
@@ -69,10 +80,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
FLOAT minf=FLT_MAX;
|
||||
unsigned int gvl = 0;
|
||||
FLOAT_V_T v0, v1, v_min;
|
||||
FLOAT_V_T_M1 v_res, v_max;
|
||||
gvl = VSETVL_MAX;
|
||||
v_res = VFMVVF_FLOAT_M1(0, gvl);
|
||||
v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl);
|
||||
FLOAT_V_T_M1 v_res;
|
||||
v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1);
|
||||
|
||||
MASK_T mask0, mask1;
|
||||
BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
|
||||
@@ -82,10 +91,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
for(; i<n/gvl; i++){
|
||||
v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
|
||||
v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
|
||||
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
|
||||
v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
|
||||
mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
|
||||
v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
|
||||
|
||||
v0 = VFABSV_FLOAT(v0, gvl);
|
||||
v1 = VFABSV_FLOAT(v1, gvl);
|
||||
|
||||
v0 = VFADDVV_FLOAT(v0, v1, gvl);
|
||||
v_min = VFMINVV_FLOAT(v_min, v0, gvl);
|
||||
@@ -93,21 +101,18 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
j += gvl;
|
||||
ix += inc_xv;
|
||||
}
|
||||
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
|
||||
minf = VFMVFS_FLOAT(v_res);
|
||||
v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
|
||||
|
||||
if(j<n){
|
||||
gvl = VSETVL(n-j);
|
||||
v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
|
||||
v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
|
||||
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
|
||||
v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
|
||||
mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
|
||||
v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
|
||||
v0 = VFABSV_FLOAT(v0, gvl);
|
||||
v1 = VFABSV_FLOAT(v1, gvl);
|
||||
v1 = VFADDVV_FLOAT(v0, v1, gvl);
|
||||
v_res = VFREDMINVS_FLOAT(v_res, v1, v_max, gvl);
|
||||
if(VFMVFS_FLOAT(v_res) < minf)
|
||||
minf = VFMVFS_FLOAT(v_res);
|
||||
v_res = VFREDMINVS_FLOAT(v1, v_res, gvl);
|
||||
}
|
||||
|
||||
minf = EXTRACT_FLOAT(v_res);
|
||||
return(minf);
|
||||
}
|
||||
|
||||
107
kernel/riscv64/zasum_rvv.c
Normal file
107
kernel/riscv64/zasum_rvv.c
Normal file
@@ -0,0 +1,107 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/* Generic vector names for the kernel below, bound to RVV intrinsics at
 * LMUL=8 (plus LMUL=1 for the reduction result): 32-bit element variants when
 * DOUBLE is not defined, 64-bit element variants otherwise. */
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) __riscv_vsetvl_e32m8(n)
|
||||
#define VSETVL_MAX __riscv_vsetvlmax_e32m8()
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VLEV_FLOAT __riscv_vle32_v_f32m8
|
||||
#define VLSEV_FLOAT __riscv_vlse32_v_f32m8
|
||||
#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m8_f32m1
|
||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8
|
||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
|
||||
#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32
|
||||
#define VFADDVV_FLOAT_TU __riscv_vfadd_vv_f32m8_tu
|
||||
#define VFABSV_FLOAT __riscv_vfabs_v_f32m8
|
||||
#else
|
||||
#define VSETVL(n) __riscv_vsetvl_e64m8(n)
|
||||
#define VSETVL_MAX __riscv_vsetvlmax_e64m8()
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VLEV_FLOAT __riscv_vle64_v_f64m8
|
||||
#define VLSEV_FLOAT __riscv_vlse64_v_f64m8
|
||||
#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m8_f64m1
|
||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8
|
||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
|
||||
#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64
|
||||
#define VFADDVV_FLOAT_TU __riscv_vfadd_vv_f64m8_tu
|
||||
#define VFABSV_FLOAT __riscv_vfabs_v_f64m8
|
||||
#endif
|
||||
|
||||
/*
 * Returns the sum of |re| + |im| over n complex elements of x, where
 * consecutive elements are inc_x complex slots (2 * inc_x FLOATs) apart.
 *
 * Returns 0.0 without touching x when n <= 0 or inc_x <= 0.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    FLOAT asumf = 0.0;
    if (n <= 0 || inc_x <= 0) return(asumf);

    FLOAT_V_T v0, v1;
    size_t vlmax = VSETVL_MAX;
    FLOAT_V_T v_sum = VFMVVF_FLOAT(0, vlmax);

    if(inc_x == 1) {

        /* Contiguous input: real and imaginary parts contribute identically
         * to the sum, so the 2 * vl scalars of each pass are read as two
         * plain unit-stride vectors rather than being de-interleaved. */
        for (size_t vl; n > 0; n -= vl, x += vl*2) {
            vl = VSETVL(n);

            v0 = VLEV_FLOAT(x, vl);
            v1 = VLEV_FLOAT(x+vl, vl);

            v0 = VFABSV_FLOAT(v0, vl);
            v1 = VFABSV_FLOAT(v1, vl);

            /* The _tu form leaves lanes at or beyond vl as they were, so a
             * short final pass keeps the partial sums of earlier passes. */
            v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v0, vl);
            v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v1, vl);
        }

    }
    else {

        /* Byte distance between consecutive complex elements. Held in
         * BLASLONG rather than int so a large inc_x cannot be truncated. */
        BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;

        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
            vl = VSETVL(n);

            /* Real parts start at x, imaginary parts one FLOAT later. */
            v0 = VLSEV_FLOAT(x, stride_x, vl);
            v1 = VLSEV_FLOAT(x+1, stride_x, vl);

            v0 = VFABSV_FLOAT(v0, vl);
            v1 = VFABSV_FLOAT(v1, vl);

            v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v0, vl);
            v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v1, vl);
        }

    }

    /* Collapse all vlmax lanes of partial sums into a single scalar. */
    FLOAT_V_T_M1 v_res = VFMVVF_FLOAT_M1(0, vlmax);
    v_res = VFREDSUMVS_FLOAT(v_sum, v_res, vlmax);
    asumf += VFMVFS_FLOAT_M1(v_res);

    return(asumf);
}
|
||||
@@ -28,37 +28,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m1()
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VFFMVFS_FLOAT vfmv_f_s_f32m1_f32
|
||||
#define VLEV_FLOAT vle32_v_f32m8
|
||||
#define VLSEV_FLOAT vlse32_v_f32m8
|
||||
#define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1
|
||||
#define MASK_T vbool4_t
|
||||
#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f32m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
|
||||
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
|
||||
#define VFADDVV_FLOAT vfadd_vv_f32m8
|
||||
#ifdef RISCV64_ZVL256B
|
||||
# define LMUL m2
|
||||
# if defined(DOUBLE)
|
||||
# define ELEN 64
|
||||
# define MLEN _b32
|
||||
# else
|
||||
# define ELEN 32
|
||||
# define MLEN _b16
|
||||
# endif
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e64m1()
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VFFMVFS_FLOAT vfmv_f_s_f64m1_f64
|
||||
#define VLEV_FLOAT vle64_v_f64m8
|
||||
#define VLSEV_FLOAT vlse64_v_f64m8
|
||||
#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1
|
||||
#define MASK_T vbool8_t
|
||||
#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f64m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
|
||||
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
|
||||
#define VFADDVV_FLOAT vfadd_vv_f64m8
|
||||
# define LMUL m8
|
||||
# if defined(DOUBLE)
|
||||
# define ELEN 64
|
||||
# define MLEN _b8
|
||||
# else
|
||||
# define ELEN 32
|
||||
# define MLEN _b4
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#define _
|
||||
#define JOIN2_X(x, y) x ## y
|
||||
#define JOIN2(x, y) JOIN2_X(x, y)
|
||||
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
|
||||
|
||||
#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _)
|
||||
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
|
||||
#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _)
|
||||
#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL)
|
||||
#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL)
|
||||
#ifdef RISCV_0p10_INTRINSICS
|
||||
#define VFREDSUMVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl)
|
||||
#else
|
||||
#define VFREDSUMVS_FLOAT JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))
|
||||
#endif
|
||||
#define VFABS_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _)
|
||||
#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _)
|
||||
#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _)
|
||||
#define VFADDVV_FLOAT JOIN(RISCV_RVV(vfadd), _vv_f, ELEN, LMUL, _)
|
||||
#define VMFLTVF_FLOAT JOIN(RISCV_RVV(vmflt), _vf_f, ELEN, LMUL, MLEN)
|
||||
|
||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
BLASLONG i=0, j=0;
|
||||
@@ -67,12 +77,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
if (n <= 0 || inc_x <= 0) return(asumf);
|
||||
unsigned int gvl = 0;
|
||||
FLOAT_V_T v0, v1, v_zero,v_sum;
|
||||
FLOAT_V_T_M1 v_res, v_z0;
|
||||
gvl = VSETVL_MAX;
|
||||
v_res = VFMVVF_FLOAT_M1(0, gvl);
|
||||
v_z0 = VFMVVF_FLOAT_M1(0, gvl);
|
||||
FLOAT_V_T_M1 v_res;
|
||||
v_res = VFMVVF_FLOAT_M1(0, 1);
|
||||
|
||||
MASK_T mask0, mask1;
|
||||
if(inc_x == 1){
|
||||
BLASLONG n2 = n * 2;
|
||||
gvl = VSETVL(n2);
|
||||
@@ -81,26 +88,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
v_sum = VFMVVF_FLOAT(0, gvl);
|
||||
for(i=0,j=0; i<n2/(gvl*2); i++){
|
||||
v0 = VLEV_FLOAT(&x[j], gvl);
|
||||
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
|
||||
v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
|
||||
v0 = VFABS_FLOAT(v0, gvl);
|
||||
v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);
|
||||
|
||||
v1 = VLEV_FLOAT(&x[j+gvl], gvl);
|
||||
mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
|
||||
v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
|
||||
v1 = VFABS_FLOAT(v1, gvl);
|
||||
v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);
|
||||
j += gvl * 2;
|
||||
}
|
||||
v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl);
|
||||
asumf += VFFMVFS_FLOAT(v_res);
|
||||
v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl);
|
||||
}
|
||||
for(;j<n2;){
|
||||
gvl = VSETVL(n2-j);
|
||||
v0 = VLEV_FLOAT(&x[j], gvl);
|
||||
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
|
||||
v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
|
||||
v_res = VFREDSUMVS_FLOAT(v_res, v0, v_z0, gvl);
|
||||
asumf += VFFMVFS_FLOAT(v_res);
|
||||
v0 = VFABS_FLOAT(v0, gvl);
|
||||
v_res = VFREDSUMVS_FLOAT(v0, v_res, gvl);
|
||||
j += gvl;
|
||||
}
|
||||
}else{
|
||||
@@ -112,34 +114,29 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
v_sum = VFMVVF_FLOAT(0, gvl);
|
||||
for(i=0,j=0; i<n/gvl; i++){
|
||||
v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
|
||||
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
|
||||
v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
|
||||
v0 = VFABS_FLOAT(v0, gvl);
|
||||
v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);
|
||||
|
||||
v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
|
||||
mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
|
||||
v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
|
||||
v1 = VFABS_FLOAT(v1, gvl);
|
||||
v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);
|
||||
|
||||
j += gvl;
|
||||
ix += inc_xv;
|
||||
}
|
||||
v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl);
|
||||
asumf += VFFMVFS_FLOAT(v_res);
|
||||
v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl);
|
||||
if(j<n){
|
||||
gvl = VSETVL(n-j);
|
||||
v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
|
||||
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
|
||||
v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
|
||||
v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
|
||||
v0 = VFABS_FLOAT(v0, gvl);
|
||||
|
||||
mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
|
||||
v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
|
||||
v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
|
||||
v1 = VFABS_FLOAT(v1, gvl);
|
||||
v_sum = VFADDVV_FLOAT(v0, v1, gvl);
|
||||
v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl);
|
||||
asumf += VFFMVFS_FLOAT(v_res);
|
||||
v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl);
|
||||
}
|
||||
}
|
||||
asumf = EXTRACT_FLOAT(v_res);
|
||||
return(asumf);
|
||||
}
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user