diff --git a/Makefile.prebuild b/Makefile.prebuild
index c4f4a2602..b56169da0 100644
--- a/Makefile.prebuild
+++ b/Makefile.prebuild
@@ -59,6 +59,10 @@ ifeq ($(TARGET), x280)
 TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d
 endif
 
+ifeq ($(TARGET), RISCV64_ZVL128B)
+TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d
+endif
+
 ifeq ($(TARGET), RISCV64_GENERIC)
 TARGET_FLAGS = -march=rv64imafdc -mabi=lp64d
 endif
diff --git a/Makefile.riscv64 b/Makefile.riscv64
index ce7a27141..93e270bde 100644
--- a/Makefile.riscv64
+++ b/Makefile.riscv64
@@ -6,6 +6,10 @@ ifeq ($(CORE), x280)
 CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d -ffast-math
 FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static
 endif
+ifeq ($(CORE), RISCV64_ZVL128B)
+CCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d
+FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -static
+endif
 ifeq ($(CORE), RISCV64_GENERIC)
 CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d
 FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d -static
diff --git a/TargetList.txt b/TargetList.txt
index f76f605cc..5b7a63831 100644
--- a/TargetList.txt
+++ b/TargetList.txt
@@ -119,6 +119,7 @@ Z14
 
 10.RISC-V 64:
 RISCV64_GENERIC (e.g. PolarFire Soc/SiFive U54)
+RISCV64_ZVL128B
 C910V
 x280
diff --git a/cpuid_riscv64.c b/cpuid_riscv64.c
index 1b6b62f21..15a539c20 100644
--- a/cpuid_riscv64.c
+++ b/cpuid_riscv64.c
@@ -73,11 +73,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define CPU_GENERIC 0
 #define CPU_C910V 1
 #define CPU_RISCV64_ZVL256B 2
+#define CPU_RISCV64_ZVL128B 3
 
 static char *cpuname[] = {
   "RISCV64_GENERIC",
   "C910V",
-  "CPU_RISCV64_ZVL256B"
+  "CPU_RISCV64_ZVL256B",
+  "CPU_RISCV64_ZVL128B"
 };
 
 int detect(void){
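Usage note (illustration only, not part of the patch): the new core is selected through the standard TARGET= mechanism listed in TargetList.txt. A typical cross-compile invocation, with the toolchain names given purely as an assumed example, would be:

    make TARGET=RISCV64_ZVL128B CC=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran HOSTCC=gcc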
diff --git a/getarch.c b/getarch.c
index 772836347..b8b7ef7e0 100644
--- a/getarch.c
+++ b/getarch.c
@@ -1691,7 +1691,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define CORENAME "x280"
 #else
 #endif
-
+#ifdef FORCE_RISCV64_ZVL128B
+#define FORCE
+#define ARCHITECTURE "RISCV64"
+#define SUBARCHITECTURE "RISCV64_ZVL128B"
+#define SUBDIRNAME "riscv64"
+#define ARCHCONFIG "-DRISCV64_ZVL128B " \
+        "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \
+        "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \
+        "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 "
+#define LIBNAME "riscv64_zvl128b"
+#define CORENAME "RISCV64_ZVL128B"
+#endif
 
 #if defined(FORCE_E2K) || defined(__e2k__)
 #define FORCE
diff --git a/kernel/riscv64/KERNEL.RISCV64_ZVL128B b/kernel/riscv64/KERNEL.RISCV64_ZVL128B
new file mode 100644
index 000000000..fec69ee09
--- /dev/null
+++ b/kernel/riscv64/KERNEL.RISCV64_ZVL128B
@@ -0,0 +1,243 @@
+SAMAXKERNEL = amax_rvv.c
+DAMAXKERNEL = amax_rvv.c
+CAMAXKERNEL = zamax_rvv.c
+ZAMAXKERNEL = zamax_rvv.c
+
+SAMINKERNEL = amin_rvv.c
+DAMINKERNEL = amin_rvv.c
+CAMINKERNEL = zamin_rvv.c
+ZAMINKERNEL = zamin_rvv.c
+
+SMAXKERNEL = max_rvv.c
+DMAXKERNEL = max_rvv.c
+
+SMINKERNEL = min_rvv.c
+DMINKERNEL = min_rvv.c
+
+ISAMAXKERNEL = iamax_rvv.c
+IDAMAXKERNEL = iamax_rvv.c
+ICAMAXKERNEL = izamax_rvv.c
+IZAMAXKERNEL = izamax_rvv.c
+
+ISAMINKERNEL = iamin_rvv.c
+IDAMINKERNEL = iamin_rvv.c
+ICAMINKERNEL = izamin_rvv.c
+IZAMINKERNEL = izamin_rvv.c
+
+ISMAXKERNEL = imax_rvv.c
+IDMAXKERNEL = imax_rvv.c
+
+ISMINKERNEL = imin_rvv.c
+IDMINKERNEL = imin_rvv.c
+
+SASUMKERNEL = asum_rvv.c
+DASUMKERNEL = asum_rvv.c
+CASUMKERNEL = zasum_rvv.c
+ZASUMKERNEL = zasum_rvv.c
+
+SSUMKERNEL = sum_rvv.c
+DSUMKERNEL = sum_rvv.c
+CSUMKERNEL = zsum_rvv.c
+ZSUMKERNEL = zsum_rvv.c
+
+SAXPYKERNEL = axpy_rvv.c
+DAXPYKERNEL = axpy_rvv.c
+CAXPYKERNEL = zaxpy_rvv.c
+ZAXPYKERNEL = zaxpy_rvv.c
+
+SAXPBYKERNEL = axpby_rvv.c
+DAXPBYKERNEL = axpby_rvv.c
+CAXPBYKERNEL = zaxpby_rvv.c
+ZAXPBYKERNEL = zaxpby_rvv.c
+
+SCOPYKERNEL = copy_rvv.c
+DCOPYKERNEL = copy_rvv.c
+CCOPYKERNEL = zcopy_rvv.c
+ZCOPYKERNEL = zcopy_rvv.c
+
+SDOTKERNEL = dot_rvv.c
+DDOTKERNEL = dot_rvv.c
+CDOTKERNEL = zdot_rvv.c
+ZDOTKERNEL = zdot_rvv.c
+DSDOTKERNEL = dot_rvv.c
+
+SNRM2KERNEL = nrm2_rvv.c
+DNRM2KERNEL = nrm2_rvv.c
+CNRM2KERNEL = znrm2_rvv.c
+ZNRM2KERNEL = znrm2_rvv.c
+
+SROTKERNEL = rot_rvv.c
+DROTKERNEL = rot_rvv.c
+CROTKERNEL = zrot_rvv.c
+ZROTKERNEL = zrot_rvv.c
+
+SSCALKERNEL = scal_rvv.c
+DSCALKERNEL = scal_rvv.c
+CSCALKERNEL = zscal_rvv.c
+ZSCALKERNEL = zscal_rvv.c
+
+SSWAPKERNEL = swap_rvv.c
+DSWAPKERNEL = swap_rvv.c
+CSWAPKERNEL = zswap_rvv.c
+ZSWAPKERNEL = zswap_rvv.c
+
+SGEMVNKERNEL = gemv_n_rvv.c
+DGEMVNKERNEL = gemv_n_rvv.c
+CGEMVNKERNEL = zgemv_n_rvv.c
+ZGEMVNKERNEL = zgemv_n_rvv.c
+
+SGEMVTKERNEL = gemv_t_rvv.c
+DGEMVTKERNEL = gemv_t_rvv.c
+CGEMVTKERNEL = zgemv_t_rvv.c
+ZGEMVTKERNEL = zgemv_t_rvv.c
+
+SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl128b.c
+SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
+SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
+SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
+SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
+SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
+SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
+SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
+SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
+
+DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl128b.c
+DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
+DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
+DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
+DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
+DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
+DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
+DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
+DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
+
+CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl128b.c
+CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
+CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
+CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
+CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
+CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
+CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
+CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
+CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
+
+ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl128b.c
+ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
+ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
+ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
+ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
+ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
+ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
+ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
+ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
+
+STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl128b.c
+STRMMUNCOPY_M = ../generic/trmm_uncopy_$(SGEMM_UNROLL_M).c
+STRMMLNCOPY_M = ../generic/trmm_lncopy_$(SGEMM_UNROLL_M).c
+STRMMUTCOPY_M = ../generic/trmm_utcopy_$(SGEMM_UNROLL_M).c
+STRMMLTCOPY_M = ../generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c
+
+DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl128b.c
+DTRMMUNCOPY_M = ../generic/trmm_uncopy_$(DGEMM_UNROLL_M).c
+DTRMMLNCOPY_M = ../generic/trmm_lncopy_$(DGEMM_UNROLL_M).c
+DTRMMUTCOPY_M = ../generic/trmm_utcopy_$(DGEMM_UNROLL_M).c
+DTRMMLTCOPY_M = ../generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c
+
+CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl128b.c
+CTRMMUNCOPY_M = ../generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c
+CTRMMLNCOPY_M = ../generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c
+CTRMMUTCOPY_M = ../generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c
+CTRMMLTCOPY_M = ../generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c
+
+ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl128b.c
+ZTRMMUNCOPY_M = ../generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c
+ZTRMMLNCOPY_M = ../generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c
+ZTRMMUTCOPY_M = ../generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c
+ZTRMMLTCOPY_M = ../generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c
+
+STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+SSYMV_U_KERNEL = symv_U_rvv.c
+SSYMV_L_KERNEL = symv_L_rvv.c
+DSYMV_U_KERNEL = symv_U_rvv.c
+DSYMV_L_KERNEL = symv_L_rvv.c
+CSYMV_U_KERNEL = zsymv_U_rvv.c
+CSYMV_L_KERNEL = zsymv_L_rvv.c
+ZSYMV_U_KERNEL = zsymv_U_rvv.c
+ZSYMV_L_KERNEL = zsymv_L_rvv.c
+
+CHEMV_L_KERNEL = zhemv_LM_rvv.c
+CHEMV_M_KERNEL = zhemv_LM_rvv.c
+CHEMV_U_KERNEL = zhemv_UV_rvv.c
+CHEMV_V_KERNEL = zhemv_UV_rvv.c
+ZHEMV_L_KERNEL = zhemv_LM_rvv.c
+ZHEMV_M_KERNEL = zhemv_LM_rvv.c
+ZHEMV_U_KERNEL = zhemv_UV_rvv.c
+ZHEMV_V_KERNEL = zhemv_UV_rvv.c
+
+SSYMMUCOPY_M = ../generic/symm_ucopy_$(SGEMM_UNROLL_M).c
+SSYMMLCOPY_M = ../generic/symm_lcopy_$(SGEMM_UNROLL_M).c
+
+DSYMMUCOPY_M = ../generic/symm_ucopy_$(DGEMM_UNROLL_M).c
+DSYMMLCOPY_M = ../generic/symm_lcopy_$(DGEMM_UNROLL_M).c
+
+CSYMMUCOPY_M = ../generic/zsymm_ucopy_$(CGEMM_UNROLL_M).c
+CSYMMLCOPY_M = ../generic/zsymm_lcopy_$(CGEMM_UNROLL_M).c
+
+ZSYMMUCOPY_M = ../generic/zsymm_ucopy_$(ZGEMM_UNROLL_M).c
+ZSYMMLCOPY_M = ../generic/zsymm_lcopy_$(ZGEMM_UNROLL_M).c
+
+CHEMMLTCOPY_M = ../generic/zhemm_ltcopy_$(CGEMM_UNROLL_M).c
+CHEMMUTCOPY_M = ../generic/zhemm_utcopy_$(CGEMM_UNROLL_M).c
+
+ZHEMMLTCOPY_M = ../generic/zhemm_ltcopy_$(ZGEMM_UNROLL_M).c
+ZHEMMUTCOPY_M = ../generic/zhemm_utcopy_$(ZGEMM_UNROLL_M).c
+
+LSAME_KERNEL = ../generic/lsame.c
+
+SCABS_KERNEL = ../generic/cabs.c
+DCABS_KERNEL = ../generic/cabs.c
+QCABS_KERNEL = ../generic/cabs.c
+
+ifndef SGEMM_BETA
+SGEMM_BETA = gemm_beta_rvv.c
+endif
+ifndef DGEMM_BETA
+DGEMM_BETA = gemm_beta_rvv.c
+endif
+ifndef CGEMM_BETA
+CGEMM_BETA = zgemm_beta_rvv.c
+endif
+ifndef ZGEMM_BETA
+ZGEMM_BETA = zgemm_beta_rvv.c
+endif
diff --git a/kernel/riscv64/cgemm_kernel_8x4_zvl128b.c b/kernel/riscv64/cgemm_kernel_8x4_zvl128b.c
new file mode 100644
index 000000000..bd615389c
--- /dev/null
+++ b/kernel/riscv64/cgemm_kernel_8x4_zvl128b.c
@@ -0,0 +1,996 @@
+/*
+
+AUTOGENERATED KERNEL
+Script: ./kernel/riscv64/generate_kernel.py
+Settings:
+    LMUL=2
+    M=8
+    M_tail_scalar_from=2
+    N=4
+    __riscv_='__riscv_'
+    complex=True
+    conjugate=False
+    cpu='zvl128b'
+    force_acc_double=False
+    index_type='BLASLONG'
+    op='gemm'
+    param_precision='float'
+    reg_width_bits=128
+    tail_policy=''
+    trace=False
+
+Derived:
+    ELEN_ACC=32
+    ELEN_PARAM=32
+    LMUL_ACC=2
+    VFMACC='__riscv_vfmacc_vf_f32m2'
+    VFMUL='__riscv_vfmul_vf_f32m2'
+    VLEV='__riscv_vle32_v_f32m2'
+    VLSEV='__riscv_vlse32_v_f32m2'
+    VMACC_TO_ACC='__riscv_vfmacc_vf_f32m2'
+    VMUL_TO_ACC='__riscv_vfmul_vf_f32m2'
+    VSETVL='__riscv_vsetvl_e32m2'
+    VSEV='__riscv_vse32_v_f32m2'
+    VSSEV='__riscv_vsse32_v_f32m2'
+    acc_vector_t='vfloat32m2_t'
+    output='cgemm_kernel_8x4_zvl128b.c'
+    param_scalar_t='float'
+    param_vector_t='vfloat32m2_t'
+
+*/
+
+#include "common.h"
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+#define S0 1
+#define S1 -1
+#define S2 1
+#define S3 1
+#define VFMACC_RR __riscv_vfmsac
+#define VFMACC_RI __riscv_vfmacc
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+#define S0 1
+#define S1 1
+#define S2 1
+#define S3 -1
+#define VFMACC_RR __riscv_vfmacc
+#define VFMACC_RI __riscv_vfmsac
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+#define S0 1
+#define S1 1
+#define S2 -1
+#define S3 1
+#define VFMACC_RR __riscv_vfmacc
+#define VFMACC_RI __riscv_vfnmsac
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+#define S0 1
+#define S1 -1
+#define S2 -1
+#define S3 -1
+#define VFMACC_RR __riscv_vfmsac
+#define VFMACC_RI __riscv_vfnmacc
+#endif
+
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc)
+
+{
+    BLASLONG gvl = 0;
+    BLASLONG m_top = 0;
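+    // Note on conventions, inferred from the indexing below: A and B are the packed
+    // panels produced by the level-3 driver, stored as interleaved (real, imag) pairs,
+    // while C is column-major with leading dimension ldc, also interleaved. Real and
+    // imaginary parts are kept in separate vector registers (A0r/A0i, ACCnr/ACCni) via
+    // strided loads, and the VFMACC_RR / VFMACC_RI macros selected above fold the
+    // per-variant conjugation signs into the multiply-accumulate, so a single k-loop
+    // body serves all transpose/conjugate variants.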
+ BLASLONG n_top = 0; + + // -- MAIN PASS + + for (BLASLONG j = 0; j < N / 4; j += 1) { + m_top = 0; + BLASLONG gvl = __riscv_vsetvl_e32m2(8); + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + float B1r = B[bi + 1 * 2 + 0]; + float B1i = B[bi + 1 * 2 + 1]; + float B2r = B[bi + 2 * 2 + 0]; + float B2i = B[bi + 2 * 2 + 1]; + float B3r = B[bi + 3 * 2 + 0]; + float B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k + // leaving 6 vector registers for temporaries + // performing 2 operations between reuses of temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + vfloat32m2_t ACC1r = tmp1r; + vfloat32m2_t ACC1i = tmp1i; + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + vfloat32m2_t ACC2r = tmp0r; + vfloat32m2_t ACC2i = tmp0i; + vfloat32m2_t ACC3r = tmp1r; + vfloat32m2_t ACC3i = tmp1i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + B2r = B[bi + 2 * 2 + 0]; + B2i = B[bi + 2 * 2 + 1]; + B3r = B[bi + 3 * 2 + 0]; + B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); + ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); + ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); + ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t 
C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C2r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C2i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C3r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C3i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); + C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); + C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl); + C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl); + C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl); + C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); + C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); + C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); + C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); + + m_top += 8; + } + + // -- tails for main pass + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + float B1r = B[bi + 1 * 2 + 0]; + float B1i = B[bi + 1 * 2 + 1]; + float B2r = B[bi + 2 * 2 + 0]; + float B2i = B[bi + 2 * 2 + 1]; + float B3r = B[bi + 3 * 2 + 0]; + float B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k + // leaving 6 vector registers for temporaries + // performing 2 operations between reuses of temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + vfloat32m2_t ACC1r = tmp1r; + 
vfloat32m2_t ACC1i = tmp1i; + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + vfloat32m2_t ACC2r = tmp0r; + vfloat32m2_t ACC2i = tmp0i; + vfloat32m2_t ACC3r = tmp1r; + vfloat32m2_t ACC3i = tmp1i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + B2r = B[bi + 2 * 2 + 0]; + B2i = B[bi + 2 * 2 + 1]; + B3r = B[bi + 3 * 2 + 0]; + B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); + ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); + ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); + ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C2r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C2i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C3r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C3i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); + C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); + C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl); + C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl); + C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl); + C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); + C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); + C3r = 
__riscv_vfnmsac(C3r, alphai, ACC3i, gvl); + C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); + + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + float result8 = 0; + float result9 = 0; + float result10 = 0; + float result11 = 0; + float result12 = 0; + float result13 = 0; + float result14 = 0; + float result15 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + result8 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result9 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result10 += S0 * A[ai + 2 + 0] * B[bi + 4 + 0] + S1 * A[ai + 2 + 1] * B[bi + 4 + 1]; + result11 += S2 * A[ai + 2 + 1] * B[bi + 4 + 0] + S3 * A[ai + 2 + 0] * B[bi + 4 + 1]; + result12 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result13 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + result14 += S0 * A[ai + 2 + 0] * B[bi + 6 + 0] + S1 * A[ai + 2 + 1] * B[bi + 6 + 1]; + result15 += S2 * A[ai + 2 + 1] * B[bi + 6 + 0] + S3 * A[ai + 2 + 0] * B[bi + 6 + 1]; + ai += 2 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result4 * alphar; + Ci += result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 1) * 2 + 1]; 
+ Cr += result6 * alphar; + Ci += result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 2 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 2 * ldc + 0) * 2 + 1]; + Cr += result8 * alphar; + Ci += result9 * alphar; + Cr -= result9 * alphai; + Ci += result8 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 2 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 2 * ldc + 1) * 2 + 1]; + Cr += result10 * alphar; + Ci += result11 * alphar; + Cr -= result11 * alphai; + Ci += result10 * alphai; + C[(ci + 2 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 3 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 3 * ldc + 0) * 2 + 1]; + Cr += result12 * alphar; + Ci += result13 * alphar; + Cr -= result13 * alphai; + Ci += result12 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 3 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 3 * ldc + 1) * 2 + 1]; + Cr += result14 * alphar; + Ci += result15 * alphar; + Cr -= result15 * alphai; + Ci += result14 * alphai; + C[(ci + 3 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result6 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result7 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + ai += 1 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 2 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 2 * ldc + 0) * 2 + 1]; + Cr += result4 * alphar; + Ci += result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 3 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 3 * ldc + 0) * 2 + 1]; + Cr += result6 * alphar; + Ci += result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 4; + } + + // -- tails for N=2 + + if (N & 2) { + gvl = __riscv_vsetvl_e32m2(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; 
+ float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + float B1r = B[bi + 1 * 2 + 0]; + float B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k + // leaving 10 vector registers for temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + vfloat32m2_t ACC1r = tmp1r; + vfloat32m2_t ACC1i = tmp1i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); + C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + float B1r = B[bi + 1 * 2 + 0]; + float B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector 
regs to hold A array contents, 4 regs to hold values accumulated over k + // leaving 10 vector registers for temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + vfloat32m2_t ACC1r = tmp1r; + vfloat32m2_t ACC1i = tmp1i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); + C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 
+ 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + ai += 2 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result4 * alphar; + Ci += result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 1) * 2 + 1]; + Cr += result6 * alphar; + Ci += result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + ai += 1 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 2; + } + + // -- tails for N=1 + + if (N & 1) { + gvl = __riscv_vsetvl_e32m2(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k + // leaving 12 vector registers for temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = 
__riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k + // leaving 12 vector registers for temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + ai += 2 * 2; + bi += 1 * 2; + } + + BLASLONG ci = n_top * ldc + 
m_top; + float Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + ai += 1 * 2; + bi += 1 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 1; + } + + return 0; +} diff --git a/kernel/riscv64/ctrmm_kernel_8x4_zvl128b.c b/kernel/riscv64/ctrmm_kernel_8x4_zvl128b.c new file mode 100644 index 000000000..3268cb810 --- /dev/null +++ b/kernel/riscv64/ctrmm_kernel_8x4_zvl128b.c @@ -0,0 +1,1102 @@ +/* + +AUTOGENERATED KERNEL +Script: ./kernel/riscv64/generate_kernel.py +Settings: + LMUL=2 + M=8 + M_tail_scalar_from=2 + N=4 + __riscv_='__riscv_' + complex=True + conjugate=False + cpu='zvl128b' + force_acc_double=False + index_type='BLASLONG' + op='trmm' + param_precision='float' + reg_width_bits=128 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=32 + ELEN_PARAM=32 + LMUL_ACC=2 + VFMACC='__riscv_vfmacc_vf_f32m2' + VFMUL='__riscv_vfmul_vf_f32m2' + VLEV='__riscv_vle32_v_f32m2' + VLSEV='__riscv_vlse32_v_f32m2' + VMACC_TO_ACC='__riscv_vfmacc_vf_f32m2' + VMUL_TO_ACC='__riscv_vfmul_vf_f32m2' + VSETVL='__riscv_vsetvl_e32m2' + VSEV='__riscv_vse32_v_f32m2' + VSSEV='__riscv_vsse32_v_f32m2' + acc_vector_t='vfloat32m2_t' + output='ctrmm_kernel_8x4_zvl128b.c' + param_scalar_t='float' + param_vector_t='vfloat32m2_t' + +*/ + +#include "common.h" + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define S0 1 +#define S1 -1 +#define S2 1 +#define S3 1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfmacc +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define S0 1 +#define S1 1 +#define S2 1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfmsac +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define S0 1 +#define S1 1 +#define S2 -1 +#define S3 1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfnmsac +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define S0 1 +#define S1 -1 +#define S2 -1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfnmacc +#endif + +#if defined(LEFT) != defined(TRANSA) +#define BACKWARDS +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc, BLASLONG offset) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + // -- MAIN PASS + + for (BLASLONG j = 0; j < N / 4; j += 1) { + m_top = 0; + BLASLONG gvl = __riscv_vsetvl_e32m2(8); + + 
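+        // How the TRMM offset is applied here (inferred from the BACKWARDS macro above):
+        // only the triangular part of the packed panels contributes to each tile, so the
+        // code either skips the first `off` K-iterations by advancing ai/bi (BACKWARDS
+        // case) or truncates the loop to `off + tile_rows` / `off + tile_cols`
+        // iterations; pass_K is the resulting per-tile trip count used in place of K.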
for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 8 * 2; + bi += off * 4 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 4; +#endif +#endif + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + float B1r = B[bi + 1 * 2 + 0]; + float B1i = B[bi + 1 * 2 + 1]; + float B2r = B[bi + 2 * 2 + 0]; + float B2i = B[bi + 2 * 2 + 1]; + float B3r = B[bi + 3 * 2 + 0]; + float B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k + // leaving 6 vector registers for temporaries + // performing 2 operations between reuses of temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + vfloat32m2_t ACC1r = tmp1r; + vfloat32m2_t ACC1i = tmp1i; + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + vfloat32m2_t ACC2r = tmp0r; + vfloat32m2_t ACC2i = tmp0i; + vfloat32m2_t ACC3r = tmp1r; + vfloat32m2_t ACC3i = tmp1i; + + for (BLASLONG k = 1; k < pass_K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + B2r = B[bi + 2 * 2 + 0]; + B2i = B[bi + 2 * 2 + 1]; + B3r = B[bi + 3 * 2 + 0]; + B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); + ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); + ACC3r = 
__riscv_vfadd(ACC3r, tmp1r, gvl); + ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); + vfloat32m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); + vfloat32m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl); + vfloat32m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl); + vfloat32m2_t C2r = __riscv_vfmul(ACC2r, alphar, gvl); + vfloat32m2_t C2i = __riscv_vfmul(ACC2i, alphar, gvl); + vfloat32m2_t C3r = __riscv_vfmul(ACC3r, alphar, gvl); + vfloat32m2_t C3i = __riscv_vfmul(ACC3i, alphar, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); + C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); + C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); + C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); + + m_top += 8; + } + + // -- tails for main pass + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4 * 2; + bi += off * 4 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 4; +#endif +#endif + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + float B1r = B[bi + 1 * 2 + 0]; + float B1i = B[bi + 1 * 2 + 1]; + float B2r = B[bi + 2 * 2 + 0]; + float B2i = B[bi + 2 * 2 + 1]; + float B3r = B[bi + 3 * 2 + 0]; + float B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k + // leaving 6 vector registers for temporaries + // performing 2 operations between reuses of temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + vfloat32m2_t ACC1r = tmp1r; + vfloat32m2_t ACC1i = tmp1i; + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = 
VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + vfloat32m2_t ACC2r = tmp0r; + vfloat32m2_t ACC2i = tmp0i; + vfloat32m2_t ACC3r = tmp1r; + vfloat32m2_t ACC3i = tmp1i; + + for (BLASLONG k = 1; k < pass_K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + B2r = B[bi + 2 * 2 + 0]; + B2i = B[bi + 2 * 2 + 1]; + B3r = B[bi + 3 * 2 + 0]; + B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); + ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); + ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); + ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); + vfloat32m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); + vfloat32m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl); + vfloat32m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl); + vfloat32m2_t C2r = __riscv_vfmul(ACC2r, alphar, gvl); + vfloat32m2_t C2i = __riscv_vfmul(ACC2i, alphar, gvl); + vfloat32m2_t C3r = __riscv_vfmul(ACC3r, alphar, gvl); + vfloat32m2_t C3i = __riscv_vfmul(ACC3i, alphar, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); + C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); + C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); + C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); + + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + float result8 = 0; + float result9 = 0; + float result10 = 0; + float result11 = 0; 
+ float result12 = 0; + float result13 = 0; + float result14 = 0; + float result15 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2 * 2; + bi += off * 4 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 4; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + result8 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result9 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result10 += S0 * A[ai + 2 + 0] * B[bi + 4 + 0] + S1 * A[ai + 2 + 1] * B[bi + 4 + 1]; + result11 += S2 * A[ai + 2 + 1] * B[bi + 4 + 0] + S3 * A[ai + 2 + 0] * B[bi + 4 + 1]; + result12 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result13 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + result14 += S0 * A[ai + 2 + 0] * B[bi + 6 + 0] + S1 * A[ai + 2 + 1] * B[bi + 6 + 1]; + result15 += S2 * A[ai + 2 + 1] * B[bi + 6 + 0] + S3 * A[ai + 2 + 0] * B[bi + 6 + 1]; + ai += 2 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = result4 * alphar; + Ci = result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = result6 * alphar; + Ci = result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + Cr = result8 * alphar; + Ci = result9 * alphar; + Cr -= result9 * alphai; + Ci += result8 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = result10 * alphar; + Ci = result11 * alphar; + Cr -= result11 * alphai; + Ci += result10 * alphai; + C[(ci + 2 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 1) * 2 + 1] = Ci; + Cr = result12 * alphar; + Ci = result13 * alphar; + Cr -= result13 * alphai; + Ci += result12 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + Cr = result14 * alphar; + Ci = result15 * alphar; + Cr -= result15 * alphai; + Ci += result14 * alphai; + C[(ci + 3 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float 
result6 = 0; + float result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1 * 2; + bi += off * 4 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 4; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result6 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result7 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + ai += 1 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = result4 * alphar; + Ci = result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = result6 * alphar; + Ci = result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 4; + } + + // -- tails for N=2 + + if (N & 2) { + gvl = __riscv_vsetvl_e32m2(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 8 * 2; + bi += off * 2 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 2; +#endif +#endif + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + float B1r = B[bi + 1 * 2 + 0]; + float B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k + // leaving 10 vector registers for temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + vfloat32m2_t ACC1r = tmp1r; + vfloat32m2_t ACC1i = tmp1i; + + for (BLASLONG k = 1; k < pass_K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = 
B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); + vfloat32m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); + vfloat32m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl); + vfloat32m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4 * 2; + bi += off * 2 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 2; +#endif +#endif + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + float B1r = B[bi + 1 * 2 + 0]; + float B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k + // leaving 10 vector registers for temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + vfloat32m2_t ACC1r = tmp1r; + vfloat32m2_t ACC1i = tmp1i; + + for (BLASLONG k = 1; k < pass_K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = 
VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); + vfloat32m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); + vfloat32m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl); + vfloat32m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2 * 2; + bi += off * 2 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 2; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + ai += 2 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = result4 * alphar; + Ci = result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = result6 * alphar; + Ci = result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1 * 2; + bi += off * 2 * 2; + 
pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 2; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + ai += 1 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 2; + } + + // -- tails for N=1 + + if (N & 1) { + gvl = __riscv_vsetvl_e32m2(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 8 * 2; + bi += off * 1 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 1; +#endif +#endif + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k + // leaving 12 vector registers for temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + + for (BLASLONG k = 1; k < pass_K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); + vfloat32m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4 * 2; + bi += off * 1 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 1; +#endif +#endif + float B0r = B[bi + 0 * 2 + 0]; + float B0i = 
B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k + // leaving 12 vector registers for temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + + for (BLASLONG k = 1; k < pass_K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); + vfloat32m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2 * 2; + bi += off * 1 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 1; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + ai += 2 * 2; + bi += 1 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1 * 2; + bi += off * 1 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 1; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + ai += 1 * 2; + bi 
+= 1 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 1; + } + + return 0; +} diff --git a/kernel/riscv64/dgemm_kernel_8x4_zvl128b.c b/kernel/riscv64/dgemm_kernel_8x4_zvl128b.c new file mode 100644 index 000000000..a613f0bce --- /dev/null +++ b/kernel/riscv64/dgemm_kernel_8x4_zvl128b.c @@ -0,0 +1,492 @@ +/* + +AUTOGENERATED KERNEL +Script: ./kernel/riscv64/generate_kernel.py +Settings: + LMUL=4 + M=8 + M_tail_scalar_from=2 + N=4 + __riscv_='__riscv_' + complex=False + conjugate=False + cpu='zvl128b' + force_acc_double=False + index_type='BLASLONG' + op='gemm' + param_precision='double' + reg_width_bits=128 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=64 + ELEN_PARAM=64 + LMUL_ACC=4 + VFMACC='__riscv_vfmacc_vf_f64m4' + VFMUL='__riscv_vfmul_vf_f64m4' + VLEV='__riscv_vle64_v_f64m4' + VLSEV='__riscv_vlse64_v_f64m4' + VMACC_TO_ACC='__riscv_vfmacc_vf_f64m4' + VMUL_TO_ACC='__riscv_vfmul_vf_f64m4' + VSETVL='__riscv_vsetvl_e64m4' + VSEV='__riscv_vse64_v_f64m4' + VSSEV='__riscv_vsse64_v_f64m4' + acc_vector_t='vfloat64m4_t' + output='dgemm_kernel_8x4_zvl128b.c' + param_scalar_t='double' + param_vector_t='vfloat64m4_t' + +*/ + +#include "common.h" + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + // -- MAIN PASS + + for (BLASLONG j = 0; j < N / 4; j += 1) { + m_top = 0; + BLASLONG gvl = __riscv_vsetvl_e64m4(8); + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + double B0 = B[bi + 0]; + double B1 = B[bi + 1]; + double B2 = B[bi + 2]; + double B3 = B[bi + 3]; + bi += 4; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); + vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl); + vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + bi += 4; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat64m4_t c2 = __riscv_vle64_v_f64m4(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat64m4_t c3 = __riscv_vle64_v_f64m4(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); + c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl); + c2 = __riscv_vfmacc_vf_f64m4(c2, alpha, result2, gvl); + c3 = __riscv_vfmacc_vf_f64m4(c3, alpha, result3, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c3, gvl); + m_top += 8; + } + + // -- tails for main pass + + if (M & 4) { + gvl 
= __riscv_vsetvl_e64m4(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + double B0 = B[bi + 0]; + double B1 = B[bi + 1]; + double B2 = B[bi + 2]; + double B3 = B[bi + 3]; + bi += 4; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); + vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl); + vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + bi += 4; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat64m4_t c2 = __riscv_vle64_v_f64m4(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat64m4_t c3 = __riscv_vle64_v_f64m4(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); + c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl); + c2 = __riscv_vfmacc_vf_f64m4(c2, alpha, result2, gvl); + c3 = __riscv_vfmacc_vf_f64m4(c3, alpha, result3, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c3, gvl); + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + result2 += A[ai + 0] * B[bi + 1]; + result3 += A[ai + 1] * B[bi + 1]; + result4 += A[ai + 0] * B[bi + 2]; + result5 += A[ai + 1] * B[bi + 2]; + result6 += A[ai + 0] * B[bi + 3]; + result7 += A[ai + 1] * B[bi + 3]; + ai += 2; + bi += 4; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 0 * ldc + 1] += alpha * result1; + C[ci + 1 * ldc + 0] += alpha * result2; + C[ci + 1 * ldc + 1] += alpha * result3; + C[ci + 2 * ldc + 0] += alpha * result4; + C[ci + 2 * ldc + 1] += alpha * result5; + C[ci + 3 * ldc + 0] += alpha * result6; + C[ci + 3 * ldc + 1] += alpha * result7; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 0] * B[bi + 1]; + result2 += A[ai + 0] * B[bi + 2]; + result3 += A[ai + 0] * B[bi + 3]; + ai += 1; + bi += 4; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 1 * ldc + 0] += alpha * result1; + C[ci + 2 * ldc + 0] += alpha * result2; + C[ci + 3 * ldc + 0] += alpha * result3; + m_top += 1; + } + + n_top += 4; + } + + // -- tails for N=2 + + if (N & 2) { + gvl = __riscv_vsetvl_e64m4(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai 
= m_top * K; + BLASLONG bi = n_top * K; + double B0 = B[bi + 0]; + double B1 = B[bi + 1]; + bi += 2; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + bi += 2; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); + c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c1, gvl); + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e64m4(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + double B0 = B[bi + 0]; + double B1 = B[bi + 1]; + bi += 2; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + bi += 2; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); + c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c1, gvl); + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + result2 += A[ai + 0] * B[bi + 1]; + result3 += A[ai + 1] * B[bi + 1]; + ai += 2; + bi += 2; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 0 * ldc + 1] += alpha * result1; + C[ci + 1 * ldc + 0] += alpha * result2; + C[ci + 1 * ldc + 1] += alpha * result3; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 0] * B[bi + 1]; + ai += 1; + bi += 2; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 1 * ldc + 0] += alpha * result1; + m_top += 1; + } + + n_top += 2; + } + + // -- tails for N=1 + + if (N & 1) { + gvl = __riscv_vsetvl_e64m4(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + double B0 = B[bi + 0]; + bi += 1; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + 
bi += 1; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e64m4(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + double B0 = B[bi + 0]; + bi += 1; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + bi += 1; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + ai += 2; + bi += 1; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 0 * ldc + 1] += alpha * result1; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + ai += 1; + bi += 1; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + m_top += 1; + } + + n_top += 1; + } + + return 0; +} diff --git a/kernel/riscv64/dtrmm_kernel_8x4_zvl128b.c b/kernel/riscv64/dtrmm_kernel_8x4_zvl128b.c new file mode 100644 index 000000000..c1e0da86e --- /dev/null +++ b/kernel/riscv64/dtrmm_kernel_8x4_zvl128b.c @@ -0,0 +1,660 @@ +/* + +AUTOGENERATED KERNEL +Script: ./kernel/riscv64/generate_kernel.py +Settings: + LMUL=4 + M=8 + M_tail_scalar_from=2 + N=4 + __riscv_='__riscv_' + complex=False + conjugate=False + cpu='zvl128b' + force_acc_double=False + index_type='BLASLONG' + op='trmm' + param_precision='double' + reg_width_bits=128 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=64 + ELEN_PARAM=64 + LMUL_ACC=4 + VFMACC='__riscv_vfmacc_vf_f64m4' + VFMUL='__riscv_vfmul_vf_f64m4' + VLEV='__riscv_vle64_v_f64m4' + VLSEV='__riscv_vlse64_v_f64m4' + VMACC_TO_ACC='__riscv_vfmacc_vf_f64m4' + VMUL_TO_ACC='__riscv_vfmul_vf_f64m4' + VSETVL='__riscv_vsetvl_e64m4' + VSEV='__riscv_vse64_v_f64m4' + VSSEV='__riscv_vsse64_v_f64m4' + acc_vector_t='vfloat64m4_t' + output='dtrmm_kernel_8x4_zvl128b.c' + param_scalar_t='double' + param_vector_t='vfloat64m4_t' + +*/ + +#include "common.h" + +#if defined(LEFT) != defined(TRANSA) +#define BACKWARDS +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc, BLASLONG offset) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + // -- MAIN PASS + + for (BLASLONG j = 0; j < N / 4; j += 1) { + m_top = 0; + BLASLONG gvl = __riscv_vsetvl_e64m4(8); + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 
8; + bi += off * 4; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 4; +#endif +#endif + double B0 = B[bi + 0]; + double B1 = B[bi + 1]; + double B2 = B[bi + 2]; + double B3 = B[bi + 3]; + bi += 4; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); + vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl); + vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + bi += 4; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); + vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl); + vfloat64m4_t c2 = __riscv_vfmul_vf_f64m4(result2, alpha, gvl); + vfloat64m4_t c3 = __riscv_vfmul_vf_f64m4(result3, alpha, gvl); + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c3, gvl); + m_top += 8; + } + + // -- tails for main pass + + if (M & 4) { + gvl = __riscv_vsetvl_e64m4(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4; + bi += off * 4; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 4; +#endif +#endif + double B0 = B[bi + 0]; + double B1 = B[bi + 1]; + double B2 = B[bi + 2]; + double B3 = B[bi + 3]; + bi += 4; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); + vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl); + vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + bi += 4; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); + vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl); + vfloat64m4_t c2 = __riscv_vfmul_vf_f64m4(result2, alpha, gvl); + vfloat64m4_t c3 = __riscv_vfmul_vf_f64m4(result3, alpha, gvl); + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c3, gvl); + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 
= 0; + double result7 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2; + bi += off * 4; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 4; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + result2 += A[ai + 0] * B[bi + 1]; + result3 += A[ai + 1] * B[bi + 1]; + result4 += A[ai + 0] * B[bi + 2]; + result5 += A[ai + 1] * B[bi + 2]; + result6 += A[ai + 0] * B[bi + 3]; + result7 += A[ai + 1] * B[bi + 3]; + ai += 2; + bi += 4; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 0 * ldc + 1] = alpha * result1; + C[ci + 1 * ldc + 0] = alpha * result2; + C[ci + 1 * ldc + 1] = alpha * result3; + C[ci + 2 * ldc + 0] = alpha * result4; + C[ci + 2 * ldc + 1] = alpha * result5; + C[ci + 3 * ldc + 0] = alpha * result6; + C[ci + 3 * ldc + 1] = alpha * result7; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1; + bi += off * 4; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 4; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 0] * B[bi + 1]; + result2 += A[ai + 0] * B[bi + 2]; + result3 += A[ai + 0] * B[bi + 3]; + ai += 1; + bi += 4; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 1 * ldc + 0] = alpha * result1; + C[ci + 2 * ldc + 0] = alpha * result2; + C[ci + 3 * ldc + 0] = alpha * result3; + m_top += 1; + } + + n_top += 4; + } + + // -- tails for N=2 + + if (N & 2) { + gvl = __riscv_vsetvl_e64m4(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 8; + bi += off * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 2; +#endif +#endif + double B0 = B[bi + 0]; + double B1 = B[bi + 1]; + bi += 2; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + bi += 2; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); + vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl); + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c1, gvl); + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e64m4(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4; + bi += off * 2; 
+ pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 2; +#endif +#endif + double B0 = B[bi + 0]; + double B1 = B[bi + 1]; + bi += 2; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + bi += 2; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); + vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl); + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c1, gvl); + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2; + bi += off * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 2; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + result2 += A[ai + 0] * B[bi + 1]; + result3 += A[ai + 1] * B[bi + 1]; + ai += 2; + bi += 2; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 0 * ldc + 1] = alpha * result1; + C[ci + 1 * ldc + 0] = alpha * result2; + C[ci + 1 * ldc + 1] = alpha * result3; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1; + bi += off * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 2; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 0] * B[bi + 1]; + ai += 1; + bi += 2; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 1 * ldc + 0] = alpha * result1; + m_top += 1; + } + + n_top += 2; + } + + // -- tails for N=1 + + if (N & 1) { + gvl = __riscv_vsetvl_e64m4(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 8; + bi += off * 1; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 1; +#endif +#endif + double B0 = B[bi + 0]; + bi += 1; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + bi += 1; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e64m4(4); + + 
BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4; + bi += off * 1; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 1; +#endif +#endif + double B0 = B[bi + 0]; + bi += 1; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + bi += 1; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2; + bi += off * 1; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 1; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + ai += 2; + bi += 1; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 0 * ldc + 1] = alpha * result1; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1; + bi += off * 1; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 1; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + ai += 1; + bi += 1; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + m_top += 1; + } + + n_top += 1; + } + + return 0; +} diff --git a/kernel/riscv64/sgemm_kernel_8x8_zvl128b.c b/kernel/riscv64/sgemm_kernel_8x8_zvl128b.c new file mode 100644 index 000000000..ad720e694 --- /dev/null +++ b/kernel/riscv64/sgemm_kernel_8x8_zvl128b.c @@ -0,0 +1,791 @@ +/* + +AUTOGENERATED KERNEL +Script: ./kernel/riscv64/generate_kernel.py +Settings: + LMUL=2 + M=8 + M_tail_scalar_from=2 + N=8 + __riscv_='__riscv_' + complex=False + conjugate=False + cpu='zvl128b' + force_acc_double=False + index_type='BLASLONG' + op='gemm' + param_precision='float' + reg_width_bits=128 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=32 + ELEN_PARAM=32 + LMUL_ACC=2 + VFMACC='__riscv_vfmacc_vf_f32m2' + VFMUL='__riscv_vfmul_vf_f32m2' + VLEV='__riscv_vle32_v_f32m2' + VLSEV='__riscv_vlse32_v_f32m2' + VMACC_TO_ACC='__riscv_vfmacc_vf_f32m2' + VMUL_TO_ACC='__riscv_vfmul_vf_f32m2' + VSETVL='__riscv_vsetvl_e32m2' + VSEV='__riscv_vse32_v_f32m2' + VSSEV='__riscv_vsse32_v_f32m2' + acc_vector_t='vfloat32m2_t' + output='sgemm_kernel_8x8_zvl128b.c' + param_scalar_t='float' + param_vector_t='vfloat32m2_t' + +*/ + +#include "common.h" + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + // -- MAIN PASS + + for (BLASLONG j = 0; j < N / 8; j += 1) { + m_top = 0; + BLASLONG gvl = __riscv_vsetvl_e32m2(8); + + for (BLASLONG i = 0; i < M / 
8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + float B2 = B[bi + 2]; + float B3 = B[bi + 3]; + float B4 = B[bi + 4]; + float B5 = B[bi + 5]; + float B6 = B[bi + 6]; + float B7 = B[bi + 7]; + bi += 8; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); + vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); + vfloat32m2_t result4 = __riscv_vfmul_vf_f32m2(A0, B4, gvl); + vfloat32m2_t result5 = __riscv_vfmul_vf_f32m2(A0, B5, gvl); + vfloat32m2_t result6 = __riscv_vfmul_vf_f32m2(A0, B6, gvl); + vfloat32m2_t result7 = __riscv_vfmul_vf_f32m2(A0, B7, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + B4 = B[bi + 4]; + B5 = B[bi + 5]; + B6 = B[bi + 6]; + B7 = B[bi + 7]; + bi += 8; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); + result4 = __riscv_vfmacc_vf_f32m2(result4, B4, A0, gvl); + result5 = __riscv_vfmacc_vf_f32m2(result5, B5, A0, gvl); + result6 = __riscv_vfmacc_vf_f32m2(result6, B6, A0, gvl); + result7 = __riscv_vfmacc_vf_f32m2(result7, B7, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c2 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c3 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c4 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c5 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c6 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c7 = __riscv_vle32_v_f32m2(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); + c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl); + c2 = __riscv_vfmacc_vf_f32m2(c2, alpha, result2, gvl); + c3 = __riscv_vfmacc_vf_f32m2(c3, alpha, result3, gvl); + c4 = __riscv_vfmacc_vf_f32m2(c4, alpha, result4, gvl); + c5 = __riscv_vfmacc_vf_f32m2(c5, alpha, result5, gvl); + c6 = __riscv_vfmacc_vf_f32m2(c6, alpha, result6, gvl); + c7 = __riscv_vfmacc_vf_f32m2(c7, alpha, result7, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c3, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c4, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c5, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c6, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c7, gvl); + m_top += 8; + } + + // -- tails for main pass + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + float B2 = B[bi + 2]; + float B3 = B[bi + 3]; + float B4 = B[bi + 4]; + float B5 = B[bi + 5]; + float B6 = B[bi + 6]; + float B7 = 
B[bi + 7]; + bi += 8; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); + vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); + vfloat32m2_t result4 = __riscv_vfmul_vf_f32m2(A0, B4, gvl); + vfloat32m2_t result5 = __riscv_vfmul_vf_f32m2(A0, B5, gvl); + vfloat32m2_t result6 = __riscv_vfmul_vf_f32m2(A0, B6, gvl); + vfloat32m2_t result7 = __riscv_vfmul_vf_f32m2(A0, B7, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + B4 = B[bi + 4]; + B5 = B[bi + 5]; + B6 = B[bi + 6]; + B7 = B[bi + 7]; + bi += 8; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); + result4 = __riscv_vfmacc_vf_f32m2(result4, B4, A0, gvl); + result5 = __riscv_vfmacc_vf_f32m2(result5, B5, A0, gvl); + result6 = __riscv_vfmacc_vf_f32m2(result6, B6, A0, gvl); + result7 = __riscv_vfmacc_vf_f32m2(result7, B7, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c2 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c3 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c4 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c5 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c6 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c7 = __riscv_vle32_v_f32m2(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); + c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl); + c2 = __riscv_vfmacc_vf_f32m2(c2, alpha, result2, gvl); + c3 = __riscv_vfmacc_vf_f32m2(c3, alpha, result3, gvl); + c4 = __riscv_vfmacc_vf_f32m2(c4, alpha, result4, gvl); + c5 = __riscv_vfmacc_vf_f32m2(c5, alpha, result5, gvl); + c6 = __riscv_vfmacc_vf_f32m2(c6, alpha, result6, gvl); + c7 = __riscv_vfmacc_vf_f32m2(c7, alpha, result7, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c3, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c4, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c5, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c6, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c7, gvl); + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + float result8 = 0; + float result9 = 0; + float result10 = 0; + float result11 = 0; + float result12 = 0; + float result13 = 0; + float result14 = 0; + float result15 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + result2 += A[ai + 0] * B[bi + 1]; + 
result3 += A[ai + 1] * B[bi + 1]; + result4 += A[ai + 0] * B[bi + 2]; + result5 += A[ai + 1] * B[bi + 2]; + result6 += A[ai + 0] * B[bi + 3]; + result7 += A[ai + 1] * B[bi + 3]; + result8 += A[ai + 0] * B[bi + 4]; + result9 += A[ai + 1] * B[bi + 4]; + result10 += A[ai + 0] * B[bi + 5]; + result11 += A[ai + 1] * B[bi + 5]; + result12 += A[ai + 0] * B[bi + 6]; + result13 += A[ai + 1] * B[bi + 6]; + result14 += A[ai + 0] * B[bi + 7]; + result15 += A[ai + 1] * B[bi + 7]; + ai += 2; + bi += 8; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 0 * ldc + 1] += alpha * result1; + C[ci + 1 * ldc + 0] += alpha * result2; + C[ci + 1 * ldc + 1] += alpha * result3; + C[ci + 2 * ldc + 0] += alpha * result4; + C[ci + 2 * ldc + 1] += alpha * result5; + C[ci + 3 * ldc + 0] += alpha * result6; + C[ci + 3 * ldc + 1] += alpha * result7; + C[ci + 4 * ldc + 0] += alpha * result8; + C[ci + 4 * ldc + 1] += alpha * result9; + C[ci + 5 * ldc + 0] += alpha * result10; + C[ci + 5 * ldc + 1] += alpha * result11; + C[ci + 6 * ldc + 0] += alpha * result12; + C[ci + 6 * ldc + 1] += alpha * result13; + C[ci + 7 * ldc + 0] += alpha * result14; + C[ci + 7 * ldc + 1] += alpha * result15; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 0] * B[bi + 1]; + result2 += A[ai + 0] * B[bi + 2]; + result3 += A[ai + 0] * B[bi + 3]; + result4 += A[ai + 0] * B[bi + 4]; + result5 += A[ai + 0] * B[bi + 5]; + result6 += A[ai + 0] * B[bi + 6]; + result7 += A[ai + 0] * B[bi + 7]; + ai += 1; + bi += 8; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 1 * ldc + 0] += alpha * result1; + C[ci + 2 * ldc + 0] += alpha * result2; + C[ci + 3 * ldc + 0] += alpha * result3; + C[ci + 4 * ldc + 0] += alpha * result4; + C[ci + 5 * ldc + 0] += alpha * result5; + C[ci + 6 * ldc + 0] += alpha * result6; + C[ci + 7 * ldc + 0] += alpha * result7; + m_top += 1; + } + + n_top += 8; + } + + // -- tails for N=4 + + if (N & 4) { + gvl = __riscv_vsetvl_e32m2(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + float B2 = B[bi + 2]; + float B3 = B[bi + 3]; + bi += 4; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); + vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + bi += 4; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c2 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += 
ldc - gvl * 0; + vfloat32m2_t c3 = __riscv_vle32_v_f32m2(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); + c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl); + c2 = __riscv_vfmacc_vf_f32m2(c2, alpha, result2, gvl); + c3 = __riscv_vfmacc_vf_f32m2(c3, alpha, result3, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c3, gvl); + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + float B2 = B[bi + 2]; + float B3 = B[bi + 3]; + bi += 4; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); + vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + bi += 4; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c2 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c3 = __riscv_vle32_v_f32m2(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); + c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl); + c2 = __riscv_vfmacc_vf_f32m2(c2, alpha, result2, gvl); + c3 = __riscv_vfmacc_vf_f32m2(c3, alpha, result3, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c3, gvl); + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + result2 += A[ai + 0] * B[bi + 1]; + result3 += A[ai + 1] * B[bi + 1]; + result4 += A[ai + 0] * B[bi + 2]; + result5 += A[ai + 1] * B[bi + 2]; + result6 += A[ai + 0] * B[bi + 3]; + result7 += A[ai + 1] * B[bi + 3]; + ai += 2; + bi += 4; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 0 * ldc + 1] += alpha * result1; + C[ci + 1 * ldc + 0] += alpha * result2; + C[ci + 1 * ldc + 1] += alpha * result3; + C[ci + 2 * ldc + 0] += alpha * result4; + C[ci + 2 * ldc + 1] += alpha * result5; + C[ci + 3 * ldc + 0] += alpha * result6; + C[ci + 3 * ldc + 1] += alpha * result7; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for 
(BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 0] * B[bi + 1]; + result2 += A[ai + 0] * B[bi + 2]; + result3 += A[ai + 0] * B[bi + 3]; + ai += 1; + bi += 4; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 1 * ldc + 0] += alpha * result1; + C[ci + 2 * ldc + 0] += alpha * result2; + C[ci + 3 * ldc + 0] += alpha * result3; + m_top += 1; + } + + n_top += 4; + } + + // -- tails for N=2 + + if (N & 2) { + gvl = __riscv_vsetvl_e32m2(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + bi += 2; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + bi += 2; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); + c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + bi += 2; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + bi += 2; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); + c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + result2 += A[ai + 0] * B[bi + 1]; + result3 += A[ai + 1] * B[bi + 1]; + ai += 2; + bi += 2; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 0 * ldc + 1] += alpha * result1; + C[ci + 1 * ldc + 0] += alpha * result2; + C[ci + 1 * ldc + 1] += alpha * result3; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 0] * B[bi + 1]; + ai += 1; + bi += 2; + } + 
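+ // Index sketch for this scalar tail (assuming the usual packed-panel layout the
+ // gemm driver hands to this kernel): A stores K consecutive elements per row
+ // starting at m_top*K, and B interleaves the 2-column block per k, so the loop
+ // above accumulates, for j in {0, 1}:
+ //   result_j = sum_k A[m_top*K + k] * B[n_top*K + 2*k + j]
+ // and the write-back below adds alpha*result_j into C[(n_top + j)*ldc + m_top].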
+ BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 1 * ldc + 0] += alpha * result1; + m_top += 1; + } + + n_top += 2; + } + + // -- tails for N=1 + + if (N & 1) { + gvl = __riscv_vsetvl_e32m2(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + float B0 = B[bi + 0]; + bi += 1; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + bi += 1; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + float B0 = B[bi + 0]; + bi += 1; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + bi += 1; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + ai += 2; + bi += 1; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 0 * ldc + 1] += alpha * result1; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + ai += 1; + bi += 1; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + m_top += 1; + } + + n_top += 1; + } + + return 0; +} diff --git a/kernel/riscv64/strmm_kernel_8x8_zvl128b.c b/kernel/riscv64/strmm_kernel_8x8_zvl128b.c new file mode 100644 index 000000000..ef18f036c --- /dev/null +++ b/kernel/riscv64/strmm_kernel_8x8_zvl128b.c @@ -0,0 +1,991 @@ +/* + +AUTOGENERATED KERNEL +Script: ./kernel/riscv64/generate_kernel.py +Settings: + LMUL=2 + M=8 + M_tail_scalar_from=2 + N=8 + __riscv_='__riscv_' + complex=False + conjugate=False + cpu='zvl128b' + force_acc_double=False + index_type='BLASLONG' + op='trmm' + param_precision='float' + reg_width_bits=128 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=32 + ELEN_PARAM=32 + LMUL_ACC=2 + VFMACC='__riscv_vfmacc_vf_f32m2' + VFMUL='__riscv_vfmul_vf_f32m2' + VLEV='__riscv_vle32_v_f32m2' + VLSEV='__riscv_vlse32_v_f32m2' + VMACC_TO_ACC='__riscv_vfmacc_vf_f32m2' + VMUL_TO_ACC='__riscv_vfmul_vf_f32m2' + VSETVL='__riscv_vsetvl_e32m2' + VSEV='__riscv_vse32_v_f32m2' + VSSEV='__riscv_vsse32_v_f32m2' + acc_vector_t='vfloat32m2_t' + output='strmm_kernel_8x8_zvl128b.c' + param_scalar_t='float' + param_vector_t='vfloat32m2_t' + +*/ + +#include "common.h" + +#if defined(LEFT) != defined(TRANSA) +#define 
BACKWARDS +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc, BLASLONG offset) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + // -- MAIN PASS + + for (BLASLONG j = 0; j < N / 8; j += 1) { + m_top = 0; + BLASLONG gvl = __riscv_vsetvl_e32m2(8); + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 8; + bi += off * 8; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 8; +#endif +#endif + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + float B2 = B[bi + 2]; + float B3 = B[bi + 3]; + float B4 = B[bi + 4]; + float B5 = B[bi + 5]; + float B6 = B[bi + 6]; + float B7 = B[bi + 7]; + bi += 8; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); + vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); + vfloat32m2_t result4 = __riscv_vfmul_vf_f32m2(A0, B4, gvl); + vfloat32m2_t result5 = __riscv_vfmul_vf_f32m2(A0, B5, gvl); + vfloat32m2_t result6 = __riscv_vfmul_vf_f32m2(A0, B6, gvl); + vfloat32m2_t result7 = __riscv_vfmul_vf_f32m2(A0, B7, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + B4 = B[bi + 4]; + B5 = B[bi + 5]; + B6 = B[bi + 6]; + B7 = B[bi + 7]; + bi += 8; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); + result4 = __riscv_vfmacc_vf_f32m2(result4, B4, A0, gvl); + result5 = __riscv_vfmacc_vf_f32m2(result5, B5, A0, gvl); + result6 = __riscv_vfmacc_vf_f32m2(result6, B6, A0, gvl); + result7 = __riscv_vfmacc_vf_f32m2(result7, B7, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl); + vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl); + vfloat32m2_t c2 = __riscv_vfmul_vf_f32m2(result2, alpha, gvl); + vfloat32m2_t c3 = __riscv_vfmul_vf_f32m2(result3, alpha, gvl); + vfloat32m2_t c4 = __riscv_vfmul_vf_f32m2(result4, alpha, gvl); + vfloat32m2_t c5 = __riscv_vfmul_vf_f32m2(result5, alpha, gvl); + vfloat32m2_t c6 = __riscv_vfmul_vf_f32m2(result6, alpha, gvl); + vfloat32m2_t c7 = __riscv_vfmul_vf_f32m2(result7, alpha, gvl); + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c3, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c4, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c5, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c6, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c7, gvl); + m_top += 8; + } + + // -- tails for main pass + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif 
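+ // trmm offset handling (same pattern as the 8x8 block above): because one operand
+ // is triangular, only part of the K range contributes for this tile. When BACKWARDS
+ // is defined, the first `off` k-iterations are skipped by advancing ai/bi and
+ // shrinking pass_K; otherwise pass_K is capped at off plus the tile dimension.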
+#ifdef BACKWARDS + ai += off * 4; + bi += off * 8; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 8; +#endif +#endif + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + float B2 = B[bi + 2]; + float B3 = B[bi + 3]; + float B4 = B[bi + 4]; + float B5 = B[bi + 5]; + float B6 = B[bi + 6]; + float B7 = B[bi + 7]; + bi += 8; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); + vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); + vfloat32m2_t result4 = __riscv_vfmul_vf_f32m2(A0, B4, gvl); + vfloat32m2_t result5 = __riscv_vfmul_vf_f32m2(A0, B5, gvl); + vfloat32m2_t result6 = __riscv_vfmul_vf_f32m2(A0, B6, gvl); + vfloat32m2_t result7 = __riscv_vfmul_vf_f32m2(A0, B7, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + B4 = B[bi + 4]; + B5 = B[bi + 5]; + B6 = B[bi + 6]; + B7 = B[bi + 7]; + bi += 8; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); + result4 = __riscv_vfmacc_vf_f32m2(result4, B4, A0, gvl); + result5 = __riscv_vfmacc_vf_f32m2(result5, B5, A0, gvl); + result6 = __riscv_vfmacc_vf_f32m2(result6, B6, A0, gvl); + result7 = __riscv_vfmacc_vf_f32m2(result7, B7, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl); + vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl); + vfloat32m2_t c2 = __riscv_vfmul_vf_f32m2(result2, alpha, gvl); + vfloat32m2_t c3 = __riscv_vfmul_vf_f32m2(result3, alpha, gvl); + vfloat32m2_t c4 = __riscv_vfmul_vf_f32m2(result4, alpha, gvl); + vfloat32m2_t c5 = __riscv_vfmul_vf_f32m2(result5, alpha, gvl); + vfloat32m2_t c6 = __riscv_vfmul_vf_f32m2(result6, alpha, gvl); + vfloat32m2_t c7 = __riscv_vfmul_vf_f32m2(result7, alpha, gvl); + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c3, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c4, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c5, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c6, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c7, gvl); + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + float result8 = 0; + float result9 = 0; + float result10 = 0; + float result11 = 0; + float result12 = 0; + float result13 = 0; + float result14 = 0; + float result15 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2; + bi += off * 8; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 8; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + 
result2 += A[ai + 0] * B[bi + 1]; + result3 += A[ai + 1] * B[bi + 1]; + result4 += A[ai + 0] * B[bi + 2]; + result5 += A[ai + 1] * B[bi + 2]; + result6 += A[ai + 0] * B[bi + 3]; + result7 += A[ai + 1] * B[bi + 3]; + result8 += A[ai + 0] * B[bi + 4]; + result9 += A[ai + 1] * B[bi + 4]; + result10 += A[ai + 0] * B[bi + 5]; + result11 += A[ai + 1] * B[bi + 5]; + result12 += A[ai + 0] * B[bi + 6]; + result13 += A[ai + 1] * B[bi + 6]; + result14 += A[ai + 0] * B[bi + 7]; + result15 += A[ai + 1] * B[bi + 7]; + ai += 2; + bi += 8; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 0 * ldc + 1] = alpha * result1; + C[ci + 1 * ldc + 0] = alpha * result2; + C[ci + 1 * ldc + 1] = alpha * result3; + C[ci + 2 * ldc + 0] = alpha * result4; + C[ci + 2 * ldc + 1] = alpha * result5; + C[ci + 3 * ldc + 0] = alpha * result6; + C[ci + 3 * ldc + 1] = alpha * result7; + C[ci + 4 * ldc + 0] = alpha * result8; + C[ci + 4 * ldc + 1] = alpha * result9; + C[ci + 5 * ldc + 0] = alpha * result10; + C[ci + 5 * ldc + 1] = alpha * result11; + C[ci + 6 * ldc + 0] = alpha * result12; + C[ci + 6 * ldc + 1] = alpha * result13; + C[ci + 7 * ldc + 0] = alpha * result14; + C[ci + 7 * ldc + 1] = alpha * result15; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1; + bi += off * 8; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 8; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 0] * B[bi + 1]; + result2 += A[ai + 0] * B[bi + 2]; + result3 += A[ai + 0] * B[bi + 3]; + result4 += A[ai + 0] * B[bi + 4]; + result5 += A[ai + 0] * B[bi + 5]; + result6 += A[ai + 0] * B[bi + 6]; + result7 += A[ai + 0] * B[bi + 7]; + ai += 1; + bi += 8; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 1 * ldc + 0] = alpha * result1; + C[ci + 2 * ldc + 0] = alpha * result2; + C[ci + 3 * ldc + 0] = alpha * result3; + C[ci + 4 * ldc + 0] = alpha * result4; + C[ci + 5 * ldc + 0] = alpha * result5; + C[ci + 6 * ldc + 0] = alpha * result6; + C[ci + 7 * ldc + 0] = alpha * result7; + m_top += 1; + } + + n_top += 8; + } + + // -- tails for N=4 + + if (N & 4) { + gvl = __riscv_vsetvl_e32m2(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 8; + bi += off * 4; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 4; +#endif +#endif + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + float B2 = B[bi + 2]; + float B3 = B[bi + 3]; + bi += 4; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); + vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + bi += 4; + + A0 = 
__riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl); + vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl); + vfloat32m2_t c2 = __riscv_vfmul_vf_f32m2(result2, alpha, gvl); + vfloat32m2_t c3 = __riscv_vfmul_vf_f32m2(result3, alpha, gvl); + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c3, gvl); + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4; + bi += off * 4; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 4; +#endif +#endif + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + float B2 = B[bi + 2]; + float B3 = B[bi + 3]; + bi += 4; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); + vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + bi += 4; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl); + vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl); + vfloat32m2_t c2 = __riscv_vfmul_vf_f32m2(result2, alpha, gvl); + vfloat32m2_t c3 = __riscv_vfmul_vf_f32m2(result3, alpha, gvl); + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c3, gvl); + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2; + bi += off * 4; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 4; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + result2 += A[ai + 0] * B[bi + 1]; + result3 += A[ai + 1] * B[bi + 1]; + result4 += A[ai + 0] * B[bi + 2]; + result5 += A[ai + 1] * B[bi + 2]; + result6 += A[ai + 0] * B[bi + 3]; + result7 += A[ai + 1] * B[bi + 3]; + ai += 2; + bi += 4; + } + + BLASLONG ci = n_top * ldc 
+ m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 0 * ldc + 1] = alpha * result1; + C[ci + 1 * ldc + 0] = alpha * result2; + C[ci + 1 * ldc + 1] = alpha * result3; + C[ci + 2 * ldc + 0] = alpha * result4; + C[ci + 2 * ldc + 1] = alpha * result5; + C[ci + 3 * ldc + 0] = alpha * result6; + C[ci + 3 * ldc + 1] = alpha * result7; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1; + bi += off * 4; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 4; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 0] * B[bi + 1]; + result2 += A[ai + 0] * B[bi + 2]; + result3 += A[ai + 0] * B[bi + 3]; + ai += 1; + bi += 4; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 1 * ldc + 0] = alpha * result1; + C[ci + 2 * ldc + 0] = alpha * result2; + C[ci + 3 * ldc + 0] = alpha * result3; + m_top += 1; + } + + n_top += 4; + } + + // -- tails for N=2 + + if (N & 2) { + gvl = __riscv_vsetvl_e32m2(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 8; + bi += off * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 2; +#endif +#endif + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + bi += 2; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + bi += 2; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl); + vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl); + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4; + bi += off * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 2; +#endif +#endif + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + bi += 2; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + bi += 2; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, 
gvl); + vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl); + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2; + bi += off * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 2; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + result2 += A[ai + 0] * B[bi + 1]; + result3 += A[ai + 1] * B[bi + 1]; + ai += 2; + bi += 2; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 0 * ldc + 1] = alpha * result1; + C[ci + 1 * ldc + 0] = alpha * result2; + C[ci + 1 * ldc + 1] = alpha * result3; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1; + bi += off * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 2; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 0] * B[bi + 1]; + ai += 1; + bi += 2; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 1 * ldc + 0] = alpha * result1; + m_top += 1; + } + + n_top += 2; + } + + // -- tails for N=1 + + if (N & 1) { + gvl = __riscv_vsetvl_e32m2(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 8; + bi += off * 1; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 1; +#endif +#endif + float B0 = B[bi + 0]; + bi += 1; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + bi += 1; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl); + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4; + bi += off * 1; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 1; +#endif +#endif + float B0 = B[bi + 0]; + bi += 1; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + bi += 1; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + 
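+ // Unlike the gemm kernel, trmm overwrites C: the result is scaled by alpha and
+ // stored directly, with no load/accumulate of the previous C contents.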
vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl); + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2; + bi += off * 1; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 1; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + ai += 2; + bi += 1; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 0 * ldc + 1] = alpha * result1; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1; + bi += off * 1; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 1; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + ai += 1; + bi += 1; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + m_top += 1; + } + + n_top += 1; + } + + return 0; +} diff --git a/kernel/riscv64/zgemm_kernel_4x4_zvl128b.c b/kernel/riscv64/zgemm_kernel_4x4_zvl128b.c new file mode 100644 index 000000000..0776f03fd --- /dev/null +++ b/kernel/riscv64/zgemm_kernel_4x4_zvl128b.c @@ -0,0 +1,720 @@ +/* + +AUTOGENERATED KERNEL +Script: ./kernel/riscv64/generate_kernel.py +Settings: + LMUL=2 + M=4 + M_tail_scalar_from=2 + N=4 + __riscv_='__riscv_' + complex=True + conjugate=False + cpu='zvl128b' + force_acc_double=False + index_type='BLASLONG' + op='gemm' + param_precision='double' + reg_width_bits=128 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=64 + ELEN_PARAM=64 + LMUL_ACC=2 + VFMACC='__riscv_vfmacc_vf_f64m2' + VFMUL='__riscv_vfmul_vf_f64m2' + VLEV='__riscv_vle64_v_f64m2' + VLSEV='__riscv_vlse64_v_f64m2' + VMACC_TO_ACC='__riscv_vfmacc_vf_f64m2' + VMUL_TO_ACC='__riscv_vfmul_vf_f64m2' + VSETVL='__riscv_vsetvl_e64m2' + VSEV='__riscv_vse64_v_f64m2' + VSSEV='__riscv_vsse64_v_f64m2' + acc_vector_t='vfloat64m2_t' + output='zgemm_kernel_4x4_zvl128b.c' + param_scalar_t='double' + param_vector_t='vfloat64m2_t' + +*/ + +#include "common.h" + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define S0 1 +#define S1 -1 +#define S2 1 +#define S3 1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfmacc +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define S0 1 +#define S1 1 +#define S2 1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfmsac +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define S0 1 +#define S1 1 +#define S2 -1 +#define S3 1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfnmsac +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define S0 1 +#define S1 -1 +#define S2 -1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfnmacc +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + // -- MAIN PASS + + for (BLASLONG j = 0; j < N / 4; j += 1) { + m_top = 0; + BLASLONG gvl = 
__riscv_vsetvl_e64m2(4); + + for (BLASLONG i = 0; i < M / 4; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + double B0r = B[bi + 0 * 2 + 0]; + double B0i = B[bi + 0 * 2 + 1]; + double B1r = B[bi + 1 * 2 + 0]; + double B1i = B[bi + 1 * 2 + 1]; + double B2r = B[bi + 2 * 2 + 0]; + double B2i = B[bi + 2 * 2 + 1]; + double B3r = B[bi + 3 * 2 + 0]; + double B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k + // leaving 6 vector registers for temporaries + // performing 2 operations between reuses of temporaries + vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + vfloat64m2_t tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + vfloat64m2_t tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat64m2_t ACC0r = tmp0r; + vfloat64m2_t ACC0i = tmp0i; + vfloat64m2_t ACC1r = tmp1r; + vfloat64m2_t ACC1i = tmp1i; + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + vfloat64m2_t ACC2r = tmp0r; + vfloat64m2_t ACC2i = tmp0i; + vfloat64m2_t ACC3r = tmp1r; + vfloat64m2_t ACC3i = tmp1i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + B2r = B[bi + 2 * 2 + 0]; + B2i = B[bi + 2 * 2 + 1]; + B3r = B[bi + 3 * 2 + 0]; + B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); + ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); + ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); + ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m2_t C0r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C0i = 
__riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat64m2_t C1r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C1i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat64m2_t C2r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C2i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat64m2_t C3r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C3i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); + C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); + C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl); + C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl); + C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl); + C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); + C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); + C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); + C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); + + m_top += 4; + } + + // -- tails for main pass + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + double result8 = 0; + double result9 = 0; + double result10 = 0; + double result11 = 0; + double result12 = 0; + double result13 = 0; + double result14 = 0; + double result15 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + result8 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result9 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result10 += S0 * A[ai + 2 + 0] * B[bi + 4 + 0] + S1 * A[ai + 2 + 
1] * B[bi + 4 + 1]; + result11 += S2 * A[ai + 2 + 1] * B[bi + 4 + 0] + S3 * A[ai + 2 + 0] * B[bi + 4 + 1]; + result12 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result13 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + result14 += S0 * A[ai + 2 + 0] * B[bi + 6 + 0] + S1 * A[ai + 2 + 1] * B[bi + 6 + 1]; + result15 += S2 * A[ai + 2 + 1] * B[bi + 6 + 0] + S3 * A[ai + 2 + 0] * B[bi + 6 + 1]; + ai += 2 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result4 * alphar; + Ci += result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 1) * 2 + 1]; + Cr += result6 * alphar; + Ci += result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 2 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 2 * ldc + 0) * 2 + 1]; + Cr += result8 * alphar; + Ci += result9 * alphar; + Cr -= result9 * alphai; + Ci += result8 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 2 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 2 * ldc + 1) * 2 + 1]; + Cr += result10 * alphar; + Ci += result11 * alphar; + Cr -= result11 * alphai; + Ci += result10 * alphai; + C[(ci + 2 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 3 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 3 * ldc + 0) * 2 + 1]; + Cr += result12 * alphar; + Ci += result13 * alphar; + Cr -= result13 * alphai; + Ci += result12 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 3 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 3 * ldc + 1) * 2 + 1]; + Cr += result14 * alphar; + Ci += result15 * alphar; + Cr -= result15 * alphai; + Ci += result14 * alphai; + C[(ci + 3 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result6 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result7 += S2 * A[ai + 0 + 1] * 
B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + ai += 1 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 2 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 2 * ldc + 0) * 2 + 1]; + Cr += result4 * alphar; + Ci += result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 3 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 3 * ldc + 0) * 2 + 1]; + Cr += result6 * alphar; + Ci += result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 4; + } + + // -- tails for N=2 + + if (N & 2) { + gvl = __riscv_vsetvl_e64m2(4); + m_top = 0; + + for (BLASLONG i = 0; i < M / 4; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + double B0r = B[bi + 0 * 2 + 0]; + double B0i = B[bi + 0 * 2 + 1]; + double B1r = B[bi + 1 * 2 + 0]; + double B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k + // leaving 10 vector registers for temporaries + vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + vfloat64m2_t tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + vfloat64m2_t tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat64m2_t ACC0r = tmp0r; + vfloat64m2_t ACC0i = tmp0i; + vfloat64m2_t ACC1r = tmp1r; + vfloat64m2_t ACC1i = tmp1i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m2_t C0r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C0i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + 
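+ // C is stored interleaved (re, im); these strided loads use a stride of
+ // 2 * sizeof(FLOAT), so each C*r vector holds the real parts and each C*i
+ // vector holds the imaginary parts of one column of the C tile.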
vfloat64m2_t C1r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C1i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); + C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + ai += 2 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result4 * alphar; + Ci += result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 1) * 2 + 1]; + Cr += result6 * alphar; + Ci += result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 
* A[ai + 0 + 0] * B[bi + 2 + 1]; + ai += 1 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 2; + } + + // -- tails for N=1 + + if (N & 1) { + gvl = __riscv_vsetvl_e64m2(4); + m_top = 0; + + for (BLASLONG i = 0; i < M / 4; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + double B0r = B[bi + 0 * 2 + 0]; + double B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k + // leaving 12 vector registers for temporaries + vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + vfloat64m2_t ACC0r = tmp0r; + vfloat64m2_t ACC0i = tmp0i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m2_t C0r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C0i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + ai += 2 * 2; + bi += 1 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + 
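+ // The four updates above are the complex multiply-accumulate
+ //   (Cr + i*Ci) += (alphar + i*alphai) * (result0 + i*result1),
+ // where result0/result1 are the sign-adjusted (S0..S3) real/imaginary sums.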
C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + ai += 1 * 2; + bi += 1 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 1; + } + + return 0; +} diff --git a/kernel/riscv64/ztrmm_kernel_4x4_zvl128b.c b/kernel/riscv64/ztrmm_kernel_4x4_zvl128b.c new file mode 100644 index 000000000..d7d5e5fea --- /dev/null +++ b/kernel/riscv64/ztrmm_kernel_4x4_zvl128b.c @@ -0,0 +1,805 @@ +/* + +AUTOGENERATED KERNEL +Script: ./kernel/riscv64/generate_kernel.py +Settings: + LMUL=2 + M=4 + M_tail_scalar_from=2 + N=4 + __riscv_='__riscv_' + complex=True + conjugate=False + cpu='zvl128b' + force_acc_double=False + index_type='BLASLONG' + op='trmm' + param_precision='double' + reg_width_bits=128 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=64 + ELEN_PARAM=64 + LMUL_ACC=2 + VFMACC='__riscv_vfmacc_vf_f64m2' + VFMUL='__riscv_vfmul_vf_f64m2' + VLEV='__riscv_vle64_v_f64m2' + VLSEV='__riscv_vlse64_v_f64m2' + VMACC_TO_ACC='__riscv_vfmacc_vf_f64m2' + VMUL_TO_ACC='__riscv_vfmul_vf_f64m2' + VSETVL='__riscv_vsetvl_e64m2' + VSEV='__riscv_vse64_v_f64m2' + VSSEV='__riscv_vsse64_v_f64m2' + acc_vector_t='vfloat64m2_t' + output='ztrmm_kernel_4x4_zvl128b.c' + param_scalar_t='double' + param_vector_t='vfloat64m2_t' + +*/ + +#include "common.h" + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define S0 1 +#define S1 -1 +#define S2 1 +#define S3 1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfmacc +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define S0 1 +#define S1 1 +#define S2 1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfmsac +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define S0 1 +#define S1 1 +#define S2 -1 +#define S3 1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfnmsac +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define S0 1 +#define S1 -1 +#define S2 -1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfnmacc +#endif + +#if defined(LEFT) != defined(TRANSA) +#define BACKWARDS +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc, BLASLONG offset) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + // -- MAIN PASS + + for (BLASLONG j = 0; j < N / 4; j += 1) { + m_top = 0; + BLASLONG gvl = __riscv_vsetvl_e64m2(4); + + for (BLASLONG i = 0; i < M / 4; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = 
-offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4 * 2; + bi += off * 4 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 4; +#endif +#endif + double B0r = B[bi + 0 * 2 + 0]; + double B0i = B[bi + 0 * 2 + 1]; + double B1r = B[bi + 1 * 2 + 0]; + double B1i = B[bi + 1 * 2 + 1]; + double B2r = B[bi + 2 * 2 + 0]; + double B2i = B[bi + 2 * 2 + 1]; + double B3r = B[bi + 3 * 2 + 0]; + double B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k + // leaving 6 vector registers for temporaries + // performing 2 operations between reuses of temporaries + vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + vfloat64m2_t tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + vfloat64m2_t tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat64m2_t ACC0r = tmp0r; + vfloat64m2_t ACC0i = tmp0i; + vfloat64m2_t ACC1r = tmp1r; + vfloat64m2_t ACC1i = tmp1i; + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + vfloat64m2_t ACC2r = tmp0r; + vfloat64m2_t ACC2i = tmp0i; + vfloat64m2_t ACC3r = tmp1r; + vfloat64m2_t ACC3i = tmp1i; + + for (BLASLONG k = 1; k < pass_K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + B2r = B[bi + 2 * 2 + 0]; + B2i = B[bi + 2 * 2 + 1]; + B3r = B[bi + 3 * 2 + 0]; + B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); + ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); + ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); + ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); + 
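// Note on the write-out in this block: unlike the gemm kernel, the trmm
// kernel does not read C before updating it; each accumulator is scaled by
// alpha and the result overwrites C.  The complex scaling is split across the
// __riscv_vfmul / __riscv_vfnmsac / __riscv_vfmacc calls here, which compute,
// per output vector pair,
//
//   Cr = alphar * ACCr - alphai * ACCi;   // vfmul, then vfnmsac
//   Ci = alphar * ACCi + alphai * ACCr;   // vfmul, then vfmacc
//
// before the strided __riscv_vsse64_v_f64m2 stores write the real and
// imaginary parts back into the interleaved C layout.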
vfloat64m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); + vfloat64m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl); + vfloat64m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl); + vfloat64m2_t C2r = __riscv_vfmul(ACC2r, alphar, gvl); + vfloat64m2_t C2i = __riscv_vfmul(ACC2i, alphar, gvl); + vfloat64m2_t C3r = __riscv_vfmul(ACC3r, alphar, gvl); + vfloat64m2_t C3i = __riscv_vfmul(ACC3i, alphar, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); + C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); + C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); + C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); + + m_top += 4; + } + + // -- tails for main pass + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + double result8 = 0; + double result9 = 0; + double result10 = 0; + double result11 = 0; + double result12 = 0; + double result13 = 0; + double result14 = 0; + double result15 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2 * 2; + bi += off * 4 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 4; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + result8 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result9 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result10 += S0 * A[ai + 2 + 0] * B[bi + 4 + 0] + S1 * A[ai + 2 + 1] * B[bi + 4 + 1]; + result11 += S2 * A[ai + 2 + 1] * B[bi + 4 + 0] + S3 * A[ai + 2 + 0] * B[bi + 4 + 1]; + result12 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result13 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + result14 += S0 * A[ai + 2 + 0] * B[bi + 6 + 0] + S1 * A[ai + 2 + 1] * B[bi + 6 + 1]; + result15 += S2 * A[ai + 2 
+ 1] * B[bi + 6 + 0] + S3 * A[ai + 2 + 0] * B[bi + 6 + 1]; + ai += 2 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = result4 * alphar; + Ci = result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = result6 * alphar; + Ci = result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + Cr = result8 * alphar; + Ci = result9 * alphar; + Cr -= result9 * alphai; + Ci += result8 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = result10 * alphar; + Ci = result11 * alphar; + Cr -= result11 * alphai; + Ci += result10 * alphai; + C[(ci + 2 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 1) * 2 + 1] = Ci; + Cr = result12 * alphar; + Ci = result13 * alphar; + Cr -= result13 * alphai; + Ci += result12 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + Cr = result14 * alphar; + Ci = result15 * alphar; + Cr -= result15 * alphai; + Ci += result14 * alphai; + C[(ci + 3 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1 * 2; + bi += off * 4 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 4; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result6 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result7 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + ai += 1 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = result4 * alphar; + Ci = result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = result6 * alphar; + Ci = result7 * alphar; + Cr -= 
result7 * alphai; + Ci += result6 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 4; + } + + // -- tails for N=2 + + if (N & 2) { + gvl = __riscv_vsetvl_e64m2(4); + m_top = 0; + + for (BLASLONG i = 0; i < M / 4; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4 * 2; + bi += off * 2 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 2; +#endif +#endif + double B0r = B[bi + 0 * 2 + 0]; + double B0i = B[bi + 0 * 2 + 1]; + double B1r = B[bi + 1 * 2 + 0]; + double B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k + // leaving 10 vector registers for temporaries + vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + vfloat64m2_t tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + vfloat64m2_t tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat64m2_t ACC0r = tmp0r; + vfloat64m2_t ACC0i = tmp0i; + vfloat64m2_t ACC1r = tmp1r; + vfloat64m2_t ACC1i = tmp1i; + + for (BLASLONG k = 1; k < pass_K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); + vfloat64m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); + vfloat64m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl); + vfloat64m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + BLASLONG ai = 
m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2 * 2; + bi += off * 2 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 2; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + ai += 2 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = result4 * alphar; + Ci = result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = result6 * alphar; + Ci = result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1 * 2; + bi += off * 2 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 2; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + ai += 1 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 2; + } + + // -- tails for N=1 + + if (N & 1) { + gvl = __riscv_vsetvl_e64m2(4); + m_top = 0; + + for (BLASLONG i = 0; i < M / 4; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4 * 2; + bi += 
off * 1 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 1; +#endif +#endif + double B0r = B[bi + 0 * 2 + 0]; + double B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k + // leaving 12 vector registers for temporaries + vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + vfloat64m2_t ACC0r = tmp0r; + vfloat64m2_t ACC0i = tmp0i; + + for (BLASLONG k = 1; k < pass_K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); + vfloat64m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2 * 2; + bi += off * 1 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 1; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + ai += 2 * 2; + bi += 1 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1 * 2; + bi += off * 1 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 1; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 
0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + ai += 1 * 2; + bi += 1 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 1; + } + + return 0; +} diff --git a/param.h b/param.h index c5c70b78e..a1a70400c 100644 --- a/param.h +++ b/param.h @@ -3123,6 +3123,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#ifdef RISCV64_ZVL128B +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_N 8 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 120 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 120 + +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + +#define SYMV_P 16 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 + +#endif + #ifdef RISCV64_ZVL256B #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0
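The unroll defaults chosen for RISCV64_ZVL128B follow from the guaranteed minimum vector length: with VLEN >= 128 bits and the LMUL=2 grouping used by the generated kernels (which request a fixed vector length via __riscv_vsetvl_e64m2(4)), one vector register group holds 4 doubles or 8 floats. That lines up with the 4x4 ztrmm kernel above, ZGEMM_DEFAULT_UNROLL_M/N = 4, and SGEMM/CGEMM_DEFAULT_UNROLL_M = 8. A minimal standalone sketch of that arithmetic, illustrative only and assuming VLEN is exactly the 128-bit minimum:

#include <stdio.h>

/* Elements held by one vector register group at a given VLEN, LMUL and
 * element width (all in bits for VLEN/element width). */
static int elements_per_group(int vlen_bits, int lmul, int elem_bits) {
    return (vlen_bits * lmul) / elem_bits;
}

int main(void) {
    /* 64-bit elements at LMUL=2: 4 lanes, matching ZGEMM_DEFAULT_UNROLL_M. */
    printf("f64 x m2 = %d lanes\n", elements_per_group(128, 2, 64));
    /* 32-bit elements at LMUL=2: 8 lanes, matching SGEMM/CGEMM UNROLL_M. */
    printf("f32 x m2 = %d lanes\n", elements_per_group(128, 2, 32));
    return 0;
}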