// OpenBLAS/kernel/riscv64/zgemm_kernel_8x4_zvl256b.c

/*
AUTOGENERATED KERNEL
Settings:
LMUL=1
M=8
M_tail_scalar_from=1
N=4
__riscv_='__riscv_'
complex=True
conjugate=False
cpu='zvl256b'
force_acc_double=False
index_type='BLASLONG'
op='gemm'
param_precision='double'
reg_width_bits=256
tail_policy=''
trace=False
Derived:
ELEN_ACC=64
ELEN_PARAM=64
LMUL_ACC=1
VFMACC='__riscv_vfmacc_vf_f64m1'
VFMUL='__riscv_vfmul_vf_f64m1'
VLEV='__riscv_vle64_v_f64m1'
VLSEV='__riscv_vlse64_v_f64m1'
VMACC_TO_ACC='__riscv_vfmacc_vf_f64m1'
VMUL_TO_ACC='__riscv_vfmul_vf_f64m1'
VSETVL='__riscv_vsetvl_e64m1'
VSEV='__riscv_vse64_v_f64m1'
VSSEV='__riscv_vsse64_v_f64m1'
acc_vector_t='vfloat64m1_t'
output='zgemm_kernel_8x4_zvl256b.c'
param_scalar_t='double'
param_vector_t='vfloat64m1_t'
*/
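// With VLEN = 256 bits (zvl256b) and LMUL = 1, one e64 vector register holds
// 4 doubles, so the __riscv_vsetvl_e64m1(4) calls below return gvl == 4.
// Each column of the 8x4 micro-tile is therefore held as two 4-element
// vectors (rows 0-3 and 4-7), with real and imaginary parts in separate
// registers: 4 columns x 2 row groups x 2 components = 16 accumulators.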
#include "common.h"
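// The two-letter variant macros (NN, NR, RN, ...) select the conjugation
// combination of A and B for which this translation unit is built.  S0..S3
// record the resulting sign pattern, used directly by the scalar M&1 tails:
//    real += S0*a_r*b_r + S1*a_i*b_i
//    imag += S2*a_i*b_r + S3*a_r*b_i
// VFMACC_RR / VFMACC_RI select the fused multiply-add flavours that apply
// the same signs in the vectorised paths.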
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define S0 1
#define S1 -1
#define S2 1
#define S3 1
#define VFMACC_RR __riscv_vfmsac
#define VFMACC_RI __riscv_vfmacc
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define S0 1
#define S1 1
#define S2 1
#define S3 -1
#define VFMACC_RR __riscv_vfmacc
#define VFMACC_RI __riscv_vfmsac
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define S0 1
#define S1 1
#define S2 -1
#define S3 1
#define VFMACC_RR __riscv_vfmacc
#define VFMACC_RI __riscv_vfnmsac
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
#define S0 1
#define S1 -1
#define S2 -1
#define S3 -1
#define VFMACC_RR __riscv_vfmsac
#define VFMACC_RI __riscv_vfnmacc
#endif
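// Micro-kernel contract (the usual OpenBLAS GEMM kernel interface):
//    C[0..M-1, 0..N-1] += (alphar + i*alphai) * A * B
// where A is the packed M x K panel and B the packed K x N panel, both stored
// as interleaved complex doubles (re, im), and C is column-major with leading
// dimension ldc counted in complex elements.  alpha is applied once per
// element after the accumulation over K.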
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc)
{
BLASLONG gvl = 0;
BLASLONG m_top = 0;
BLASLONG n_top = 0;
// -- MAIN PASS: full 8x4 micro-tiles (M in vector blocks of 8 rows, N unrolled by 4 columns)
for (BLASLONG j=0; j<N/4; j+=1) {
m_top = 0;
BLASLONG gvl = __riscv_vsetvl_e64m1(4);
for (BLASLONG i=0; i<M/8; i+=1) {
BLASLONG ai=m_top*K*2;
BLASLONG bi=n_top*K*2;
double B0r = B[bi+0*2+0];
double B0i = B[bi+0*2+1];
double B1r = B[bi+1*2+0];
double B1i = B[bi+1*2+1];
double B2r = B[bi+2*2+0];
double B2i = B[bi+2*2+1];
double B3r = B[bi+3*2+0];
double B3i = B[bi+3*2+1];
bi += 4*2;
vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
vfloat64m1_t A1r = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2], sizeof(FLOAT)*2, gvl );
vfloat64m1_t A1i = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2+1], sizeof(FLOAT)*2, gvl );
ai += 8*2;
// 4 vector regs to hold A array contents, 16 regs to hold values accumulated over k
// leaving 12 vector registers for temporaries
// performing 4 operations between reuses of temporaries
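// Each complex product is built in two steps: the two products involving the
// scalar imaginary part B_i (a_i*b_i and a_r*b_i) are formed first with
// vfmul, then VFMACC_RR / VFMACC_RI fuse in the products with the real part
// B_r, applying the sign required by the conjugation variant selected above.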
vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
vfloat64m1_t tmp1r = __riscv_vfmul_vf_f64m1( A1i, B0i, gvl);
vfloat64m1_t tmp1i = __riscv_vfmul_vf_f64m1( A1r, B0i, gvl);
vfloat64m1_t tmp2r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
vfloat64m1_t tmp2i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
vfloat64m1_t tmp3r = __riscv_vfmul_vf_f64m1( A1i, B1i, gvl);
vfloat64m1_t tmp3i = __riscv_vfmul_vf_f64m1( A1r, B1i, gvl);
tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
tmp1r = VFMACC_RR( tmp1r, B0r, A1r, gvl);
tmp1i = VFMACC_RI( tmp1i, B0r, A1i, gvl);
tmp2r = VFMACC_RR( tmp2r, B1r, A0r, gvl);
tmp2i = VFMACC_RI( tmp2i, B1r, A0i, gvl);
tmp3r = VFMACC_RR( tmp3r, B1r, A1r, gvl);
tmp3i = VFMACC_RI( tmp3i, B1r, A1i, gvl);
vfloat64m1_t ACC0r = tmp0r;
vfloat64m1_t ACC0i = tmp0i;
vfloat64m1_t ACC1r = tmp1r;
vfloat64m1_t ACC1i = tmp1i;
vfloat64m1_t ACC2r = tmp2r;
vfloat64m1_t ACC2i = tmp2i;
vfloat64m1_t ACC3r = tmp3r;
vfloat64m1_t ACC3i = tmp3i;
tmp0r = __riscv_vfmul_vf_f64m1( A0i, B2i, gvl);
tmp0i = __riscv_vfmul_vf_f64m1( A0r, B2i, gvl);
tmp1r = __riscv_vfmul_vf_f64m1( A1i, B2i, gvl);
tmp1i = __riscv_vfmul_vf_f64m1( A1r, B2i, gvl);
tmp2r = __riscv_vfmul_vf_f64m1( A0i, B3i, gvl);
tmp2i = __riscv_vfmul_vf_f64m1( A0r, B3i, gvl);
tmp3r = __riscv_vfmul_vf_f64m1( A1i, B3i, gvl);
tmp3i = __riscv_vfmul_vf_f64m1( A1r, B3i, gvl);
tmp0r = VFMACC_RR( tmp0r, B2r, A0r, gvl);
tmp0i = VFMACC_RI( tmp0i, B2r, A0i, gvl);
tmp1r = VFMACC_RR( tmp1r, B2r, A1r, gvl);
tmp1i = VFMACC_RI( tmp1i, B2r, A1i, gvl);
tmp2r = VFMACC_RR( tmp2r, B3r, A0r, gvl);
tmp2i = VFMACC_RI( tmp2i, B3r, A0i, gvl);
tmp3r = VFMACC_RR( tmp3r, B3r, A1r, gvl);
tmp3i = VFMACC_RI( tmp3i, B3r, A1i, gvl);
vfloat64m1_t ACC4r = tmp0r;
vfloat64m1_t ACC4i = tmp0i;
vfloat64m1_t ACC5r = tmp1r;
vfloat64m1_t ACC5i = tmp1i;
vfloat64m1_t ACC6r = tmp2r;
vfloat64m1_t ACC6i = tmp2i;
vfloat64m1_t ACC7r = tmp3r;
vfloat64m1_t ACC7i = tmp3i;
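// The k == 0 iteration above seeds the accumulators directly from the first
// products; the remaining K-1 iterations recompute fresh products into the
// tmp registers and fold them in with vfadd, so no zeroing pass is needed.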
for(BLASLONG k=1; k<K; k++) {
B0r = B[bi+0*2+0];
B0i = B[bi+0*2+1];
B1r = B[bi+1*2+0];
B1i = B[bi+1*2+1];
B2r = B[bi+2*2+0];
B2i = B[bi+2*2+1];
B3r = B[bi+3*2+0];
B3i = B[bi+3*2+1];
bi += 4*2;
A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
A1r = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2], sizeof(FLOAT)*2, gvl );
A1i = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2+1], sizeof(FLOAT)*2, gvl );
ai += 8*2;
tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
tmp1r = __riscv_vfmul_vf_f64m1( A1i, B0i, gvl);
tmp1i = __riscv_vfmul_vf_f64m1( A1r, B0i, gvl);
tmp2r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
tmp2i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
tmp3r = __riscv_vfmul_vf_f64m1( A1i, B1i, gvl);
tmp3i = __riscv_vfmul_vf_f64m1( A1r, B1i, gvl);
tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
tmp1r = VFMACC_RR( tmp1r, B0r, A1r, gvl);
tmp1i = VFMACC_RI( tmp1i, B0r, A1i, gvl);
tmp2r = VFMACC_RR( tmp2r, B1r, A0r, gvl);
tmp2i = VFMACC_RI( tmp2i, B1r, A0i, gvl);
tmp3r = VFMACC_RR( tmp3r, B1r, A1r, gvl);
tmp3i = VFMACC_RI( tmp3i, B1r, A1i, gvl);
ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
tmp0r = __riscv_vfmul_vf_f64m1( A0i, B2i, gvl);
tmp0i = __riscv_vfmul_vf_f64m1( A0r, B2i, gvl);
tmp1r = __riscv_vfmul_vf_f64m1( A1i, B2i, gvl);
tmp1i = __riscv_vfmul_vf_f64m1( A1r, B2i, gvl);
tmp2r = __riscv_vfmul_vf_f64m1( A0i, B3i, gvl);
tmp2i = __riscv_vfmul_vf_f64m1( A0r, B3i, gvl);
tmp3r = __riscv_vfmul_vf_f64m1( A1i, B3i, gvl);
tmp3i = __riscv_vfmul_vf_f64m1( A1r, B3i, gvl);
tmp0r = VFMACC_RR( tmp0r, B2r, A0r, gvl);
tmp0i = VFMACC_RI( tmp0i, B2r, A0i, gvl);
tmp1r = VFMACC_RR( tmp1r, B2r, A1r, gvl);
tmp1i = VFMACC_RI( tmp1i, B2r, A1i, gvl);
tmp2r = VFMACC_RR( tmp2r, B3r, A0r, gvl);
tmp2i = VFMACC_RI( tmp2i, B3r, A0i, gvl);
tmp3r = VFMACC_RR( tmp3r, B3r, A1r, gvl);
tmp3i = VFMACC_RI( tmp3i, B3r, A1i, gvl);
ACC4r = __riscv_vfadd( ACC4r, tmp0r, gvl);
ACC4i = __riscv_vfadd( ACC4i, tmp0i, gvl);
ACC5r = __riscv_vfadd( ACC5r, tmp1r, gvl);
ACC5i = __riscv_vfadd( ACC5i, tmp1i, gvl);
ACC6r = __riscv_vfadd( ACC6r, tmp2r, gvl);
ACC6i = __riscv_vfadd( ACC6i, tmp2i, gvl);
ACC7r = __riscv_vfadd( ACC7r, tmp3r, gvl);
ACC7i = __riscv_vfadd( ACC7i, tmp3i, gvl);
}
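// Scale the accumulated tile by complex alpha and add it into C.  C is read
// and written with stride-2 accesses (re/im interleaved) so real and
// imaginary lanes stay in separate vectors:
//    C_r += alphar*ACC_r - alphai*ACC_i
//    C_i += alphar*ACC_i + alphai*ACC_r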
BLASLONG ci=n_top*ldc+m_top;
vfloat64m1_t C0r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
vfloat64m1_t C0i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
ci += gvl;
vfloat64m1_t C1r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
vfloat64m1_t C1i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
ci += ldc-gvl*1;
vfloat64m1_t C2r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
vfloat64m1_t C2i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
ci += gvl;
vfloat64m1_t C3r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
vfloat64m1_t C3i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
ci += ldc-gvl*1;
vfloat64m1_t C4r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
vfloat64m1_t C4i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
ci += gvl;
vfloat64m1_t C5r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
vfloat64m1_t C5i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
ci += ldc-gvl*1;
vfloat64m1_t C6r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
vfloat64m1_t C6i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
ci += gvl;
vfloat64m1_t C7r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
vfloat64m1_t C7i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
C1r = __riscv_vfmacc( C1r, alphar, ACC1r, gvl );
C1i = __riscv_vfmacc( C1i, alphar, ACC1i, gvl );
C2r = __riscv_vfmacc( C2r, alphar, ACC2r, gvl );
C2i = __riscv_vfmacc( C2i, alphar, ACC2i, gvl );
C3r = __riscv_vfmacc( C3r, alphar, ACC3r, gvl );
C3i = __riscv_vfmacc( C3i, alphar, ACC3i, gvl );
C4r = __riscv_vfmacc( C4r, alphar, ACC4r, gvl );
C4i = __riscv_vfmacc( C4i, alphar, ACC4i, gvl );
C5r = __riscv_vfmacc( C5r, alphar, ACC5r, gvl );
C5i = __riscv_vfmacc( C5i, alphar, ACC5i, gvl );
C6r = __riscv_vfmacc( C6r, alphar, ACC6r, gvl );
C6i = __riscv_vfmacc( C6i, alphar, ACC6i, gvl );
C7r = __riscv_vfmacc( C7r, alphar, ACC7r, gvl );
C7i = __riscv_vfmacc( C7i, alphar, ACC7i, gvl );
C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
C4r = __riscv_vfnmsac( C4r, alphai, ACC4i, gvl );
C4i = __riscv_vfmacc ( C4i, alphai, ACC4r, gvl );
C5r = __riscv_vfnmsac( C5r, alphai, ACC5i, gvl );
C5i = __riscv_vfmacc ( C5i, alphai, ACC5r, gvl );
C6r = __riscv_vfnmsac( C6r, alphai, ACC6i, gvl );
C6i = __riscv_vfmacc ( C6i, alphai, ACC6r, gvl );
C7r = __riscv_vfnmsac( C7r, alphai, ACC7i, gvl );
C7i = __riscv_vfmacc ( C7i, alphai, ACC7r, gvl );
ci=n_top*ldc+m_top;
__riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
__riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
ci += gvl;
__riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
__riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
ci += ldc-gvl*1;
__riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
__riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
ci += gvl;
__riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
__riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
ci += ldc-gvl*1;
__riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C4r, gvl);
__riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C4i, gvl);
ci += gvl;
__riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C5r, gvl);
__riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C5i, gvl);
ci += ldc-gvl*1;
__riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C6r, gvl);
__riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C6i, gvl);
ci += gvl;
__riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C7r, gvl);
__riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C7i, gvl);
m_top += 8;
}
// -- M tails for the N=4 pass: vectorised M&4 and M&2 blocks, then a scalar M&1 row
if( M & 4 ) {
gvl = __riscv_vsetvl_e64m1(4);
BLASLONG ai=m_top*K*2;
BLASLONG bi=n_top*K*2;
double B0r = B[bi+0*2+0];
double B0i = B[bi+0*2+1];
double B1r = B[bi+1*2+0];
double B1i = B[bi+1*2+1];
double B2r = B[bi+2*2+0];
double B2i = B[bi+2*2+1];
double B3r = B[bi+3*2+0];
double B3i = B[bi+3*2+1];
bi += 4*2;
vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
ai += 4*2;
// 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
// leaving 22 vector registers for temporaries
vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
vfloat64m1_t tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
vfloat64m1_t tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
vfloat64m1_t tmp2r = __riscv_vfmul_vf_f64m1( A0i, B2i, gvl);
vfloat64m1_t tmp2i = __riscv_vfmul_vf_f64m1( A0r, B2i, gvl);
vfloat64m1_t tmp3r = __riscv_vfmul_vf_f64m1( A0i, B3i, gvl);
vfloat64m1_t tmp3i = __riscv_vfmul_vf_f64m1( A0r, B3i, gvl);
tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
vfloat64m1_t ACC0r = tmp0r;
vfloat64m1_t ACC0i = tmp0i;
vfloat64m1_t ACC1r = tmp1r;
vfloat64m1_t ACC1i = tmp1i;
vfloat64m1_t ACC2r = tmp2r;
vfloat64m1_t ACC2i = tmp2i;
vfloat64m1_t ACC3r = tmp3r;
vfloat64m1_t ACC3i = tmp3i;
for(BLASLONG k=1; k<K; k++) {
B0r = B[bi+0*2+0];
B0i = B[bi+0*2+1];
B1r = B[bi+1*2+0];
B1i = B[bi+1*2+1];
B2r = B[bi+2*2+0];
B2i = B[bi+2*2+1];
B3r = B[bi+3*2+0];
B3i = B[bi+3*2+1];
bi += 4*2;
A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
ai += 4*2;
tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
tmp2r = __riscv_vfmul_vf_f64m1( A0i, B2i, gvl);
tmp2i = __riscv_vfmul_vf_f64m1( A0r, B2i, gvl);
tmp3r = __riscv_vfmul_vf_f64m1( A0i, B3i, gvl);
tmp3i = __riscv_vfmul_vf_f64m1( A0r, B3i, gvl);
tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
}
BLASLONG ci=n_top*ldc+m_top;
vfloat64m1_t C0r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
vfloat64m1_t C0i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
ci += ldc-gvl*0;
vfloat64m1_t C1r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
vfloat64m1_t C1i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
ci += ldc-gvl*0;
vfloat64m1_t C2r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
vfloat64m1_t C2i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
ci += ldc-gvl*0;
vfloat64m1_t C3r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
vfloat64m1_t C3i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
C1r = __riscv_vfmacc( C1r, alphar, ACC1r, gvl );
C1i = __riscv_vfmacc( C1i, alphar, ACC1i, gvl );
C2r = __riscv_vfmacc( C2r, alphar, ACC2r, gvl );
C2i = __riscv_vfmacc( C2i, alphar, ACC2i, gvl );
C3r = __riscv_vfmacc( C3r, alphar, ACC3r, gvl );
C3i = __riscv_vfmacc( C3i, alphar, ACC3i, gvl );
C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
ci=n_top*ldc+m_top;
__riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
__riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
ci += ldc-gvl*0;
__riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
__riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
ci += ldc-gvl*0;
__riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
__riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
ci += ldc-gvl*0;
__riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
__riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
m_top += 4;
}
if( M & 2 ) {
gvl = __riscv_vsetvl_e64m1(2);
BLASLONG ai=m_top*K*2;
BLASLONG bi=n_top*K*2;
double B0r = B[bi+0*2+0];
double B0i = B[bi+0*2+1];
double B1r = B[bi+1*2+0];
double B1i = B[bi+1*2+1];
double B2r = B[bi+2*2+0];
double B2i = B[bi+2*2+1];
double B3r = B[bi+3*2+0];
double B3i = B[bi+3*2+1];
bi += 4*2;
vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
ai += 2*2;
// 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
// leaving 22 vector registers for temporaries
vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
vfloat64m1_t tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
vfloat64m1_t tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
vfloat64m1_t tmp2r = __riscv_vfmul_vf_f64m1( A0i, B2i, gvl);
vfloat64m1_t tmp2i = __riscv_vfmul_vf_f64m1( A0r, B2i, gvl);
vfloat64m1_t tmp3r = __riscv_vfmul_vf_f64m1( A0i, B3i, gvl);
vfloat64m1_t tmp3i = __riscv_vfmul_vf_f64m1( A0r, B3i, gvl);
tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
vfloat64m1_t ACC0r = tmp0r;
vfloat64m1_t ACC0i = tmp0i;
vfloat64m1_t ACC1r = tmp1r;
vfloat64m1_t ACC1i = tmp1i;
vfloat64m1_t ACC2r = tmp2r;
vfloat64m1_t ACC2i = tmp2i;
vfloat64m1_t ACC3r = tmp3r;
vfloat64m1_t ACC3i = tmp3i;
for(BLASLONG k=1; k<K; k++) {
B0r = B[bi+0*2+0];
B0i = B[bi+0*2+1];
B1r = B[bi+1*2+0];
B1i = B[bi+1*2+1];
B2r = B[bi+2*2+0];
B2i = B[bi+2*2+1];
B3r = B[bi+3*2+0];
B3i = B[bi+3*2+1];
bi += 4*2;
A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
ai += 2*2;
tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
tmp2r = __riscv_vfmul_vf_f64m1( A0i, B2i, gvl);
tmp2i = __riscv_vfmul_vf_f64m1( A0r, B2i, gvl);
tmp3r = __riscv_vfmul_vf_f64m1( A0i, B3i, gvl);
tmp3i = __riscv_vfmul_vf_f64m1( A0r, B3i, gvl);
tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
}
BLASLONG ci=n_top*ldc+m_top;
vfloat64m1_t C0r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
vfloat64m1_t C0i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
ci += ldc-gvl*0;
vfloat64m1_t C1r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
vfloat64m1_t C1i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
ci += ldc-gvl*0;
vfloat64m1_t C2r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
vfloat64m1_t C2i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
ci += ldc-gvl*0;
vfloat64m1_t C3r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
vfloat64m1_t C3i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
C1r = __riscv_vfmacc( C1r, alphar, ACC1r, gvl );
C1i = __riscv_vfmacc( C1i, alphar, ACC1i, gvl );
C2r = __riscv_vfmacc( C2r, alphar, ACC2r, gvl );
C2i = __riscv_vfmacc( C2i, alphar, ACC2i, gvl );
C3r = __riscv_vfmacc( C3r, alphar, ACC3r, gvl );
C3i = __riscv_vfmacc( C3i, alphar, ACC3i, gvl );
C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
ci=n_top*ldc+m_top;
__riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
__riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
ci += ldc-gvl*0;
__riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
__riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
ci += ldc-gvl*0;
__riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
__riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
ci += ldc-gvl*0;
__riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
__riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
m_top += 2;
}
if( M & 1 ) {
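// Single leftover row: plain scalar complex dot products over K, one
// (real, imag) pair of partial sums per column, with signs taken from S0..S3.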
double result0 = 0;
double result1 = 0;
double result2 = 0;
double result3 = 0;
double result4 = 0;
double result5 = 0;
double result6 = 0;
double result7 = 0;
BLASLONG ai=m_top*K*2;
BLASLONG bi=n_top*K*2;
for(BLASLONG k=0; k<K; k++) {
result0+=S0*A[ai+0+0]*B[bi+0+0] + S1*A[ai+0+1]*B[bi+0+1];
result1+=S2*A[ai+0+1]*B[bi+0+0] + S3*A[ai+0+0]*B[bi+0+1];
result2+=S0*A[ai+0+0]*B[bi+2+0] + S1*A[ai+0+1]*B[bi+2+1];
result3+=S2*A[ai+0+1]*B[bi+2+0] + S3*A[ai+0+0]*B[bi+2+1];
result4+=S0*A[ai+0+0]*B[bi+4+0] + S1*A[ai+0+1]*B[bi+4+1];
result5+=S2*A[ai+0+1]*B[bi+4+0] + S3*A[ai+0+0]*B[bi+4+1];
result6+=S0*A[ai+0+0]*B[bi+6+0] + S1*A[ai+0+1]*B[bi+6+1];
result7+=S2*A[ai+0+1]*B[bi+6+0] + S3*A[ai+0+0]*B[bi+6+1];
ai+=1*2;
bi+=4*2;
}
BLASLONG ci=n_top*ldc+m_top;
double Cr, Ci;
Cr = C[(ci+0*ldc+0)*2+0];
Ci = C[(ci+0*ldc+0)*2+1];
Cr += result0*alphar;
Ci += result1*alphar;
Cr -= result1*alphai;
Ci += result0*alphai;
C[(ci+0*ldc+0)*2+0] = Cr;
C[(ci+0*ldc+0)*2+1] = Ci;
Cr = C[(ci+1*ldc+0)*2+0];
Ci = C[(ci+1*ldc+0)*2+1];
Cr += result2*alphar;
Ci += result3*alphar;
Cr -= result3*alphai;
Ci += result2*alphai;
C[(ci+1*ldc+0)*2+0] = Cr;
C[(ci+1*ldc+0)*2+1] = Ci;
Cr = C[(ci+2*ldc+0)*2+0];
Ci = C[(ci+2*ldc+0)*2+1];
Cr += result4*alphar;
Ci += result5*alphar;
Cr -= result5*alphai;
Ci += result4*alphai;
C[(ci+2*ldc+0)*2+0] = Cr;
C[(ci+2*ldc+0)*2+1] = Ci;
Cr = C[(ci+3*ldc+0)*2+0];
Ci = C[(ci+3*ldc+0)*2+1];
Cr += result6*alphar;
Ci += result7*alphar;
Cr -= result7*alphai;
Ci += result6*alphai;
C[(ci+3*ldc+0)*2+0] = Cr;
C[(ci+3*ldc+0)*2+1] = Ci;
m_top+=1;
}
n_top += 4;
}
// -- N tail: two remaining columns (N & 2), same blocking over M as the main pass
if( N & 2 ) {
gvl = __riscv_vsetvl_e64m1(4);
m_top = 0;
for (BLASLONG i=0; i<M/8; i+=1) {
BLASLONG ai=m_top*K*2;
BLASLONG bi=n_top*K*2;
double B0r = B[bi+0*2+0];
double B0i = B[bi+0*2+1];
double B1r = B[bi+1*2+0];
double B1i = B[bi+1*2+1];
bi += 2*2;
vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
vfloat64m1_t A1r = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2], sizeof(FLOAT)*2, gvl );
vfloat64m1_t A1i = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2+1], sizeof(FLOAT)*2, gvl );
ai += 8*2;
// 4 vector regs to hold A array contents, 8 regs to hold values accumulated over k
// leaving 20 vector registers for temporaries
vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
vfloat64m1_t tmp1r = __riscv_vfmul_vf_f64m1( A1i, B0i, gvl);
vfloat64m1_t tmp1i = __riscv_vfmul_vf_f64m1( A1r, B0i, gvl);
vfloat64m1_t tmp2r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
vfloat64m1_t tmp2i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
vfloat64m1_t tmp3r = __riscv_vfmul_vf_f64m1( A1i, B1i, gvl);
vfloat64m1_t tmp3i = __riscv_vfmul_vf_f64m1( A1r, B1i, gvl);
tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
tmp1r = VFMACC_RR( tmp1r, B0r, A1r, gvl);
tmp1i = VFMACC_RI( tmp1i, B0r, A1i, gvl);
tmp2r = VFMACC_RR( tmp2r, B1r, A0r, gvl);
tmp2i = VFMACC_RI( tmp2i, B1r, A0i, gvl);
tmp3r = VFMACC_RR( tmp3r, B1r, A1r, gvl);
tmp3i = VFMACC_RI( tmp3i, B1r, A1i, gvl);
vfloat64m1_t ACC0r = tmp0r;
vfloat64m1_t ACC0i = tmp0i;
vfloat64m1_t ACC1r = tmp1r;
vfloat64m1_t ACC1i = tmp1i;
vfloat64m1_t ACC2r = tmp2r;
vfloat64m1_t ACC2i = tmp2i;
vfloat64m1_t ACC3r = tmp3r;
vfloat64m1_t ACC3i = tmp3i;
for(BLASLONG k=1; k<K; k++) {
B0r = B[bi+0*2+0];
B0i = B[bi+0*2+1];
B1r = B[bi+1*2+0];
B1i = B[bi+1*2+1];
bi += 2*2;
A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
A1r = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2], sizeof(FLOAT)*2, gvl );
A1i = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2+1], sizeof(FLOAT)*2, gvl );
ai += 8*2;
tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
tmp1r = __riscv_vfmul_vf_f64m1( A1i, B0i, gvl);
tmp1i = __riscv_vfmul_vf_f64m1( A1r, B0i, gvl);
tmp2r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
tmp2i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
tmp3r = __riscv_vfmul_vf_f64m1( A1i, B1i, gvl);
tmp3i = __riscv_vfmul_vf_f64m1( A1r, B1i, gvl);
tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
tmp1r = VFMACC_RR( tmp1r, B0r, A1r, gvl);
tmp1i = VFMACC_RI( tmp1i, B0r, A1i, gvl);
tmp2r = VFMACC_RR( tmp2r, B1r, A0r, gvl);
tmp2i = VFMACC_RI( tmp2i, B1r, A0i, gvl);
tmp3r = VFMACC_RR( tmp3r, B1r, A1r, gvl);
tmp3i = VFMACC_RI( tmp3i, B1r, A1i, gvl);
ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
}
BLASLONG ci=n_top*ldc+m_top;
vfloat64m1_t C0r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
vfloat64m1_t C0i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
ci += gvl;
vfloat64m1_t C1r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
vfloat64m1_t C1i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
ci += ldc-gvl*1;
vfloat64m1_t C2r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
vfloat64m1_t C2i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
ci += gvl;
vfloat64m1_t C3r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
vfloat64m1_t C3i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
C1r = __riscv_vfmacc( C1r, alphar, ACC1r, gvl );
C1i = __riscv_vfmacc( C1i, alphar, ACC1i, gvl );
C2r = __riscv_vfmacc( C2r, alphar, ACC2r, gvl );
C2i = __riscv_vfmacc( C2i, alphar, ACC2i, gvl );
C3r = __riscv_vfmacc( C3r, alphar, ACC3r, gvl );
C3i = __riscv_vfmacc( C3i, alphar, ACC3i, gvl );
C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
ci=n_top*ldc+m_top;
__riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
__riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
ci += gvl;
__riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
__riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
ci += ldc-gvl*1;
__riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
__riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
ci += gvl;
__riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
__riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
m_top += 8;
}
if( M & 4 ) {
gvl = __riscv_vsetvl_e64m1(4);
BLASLONG ai=m_top*K*2;
BLASLONG bi=n_top*K*2;
double B0r = B[bi+0*2+0];
double B0i = B[bi+0*2+1];
double B1r = B[bi+1*2+0];
double B1i = B[bi+1*2+1];
bi += 2*2;
vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
ai += 4*2;
// 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
// leaving 26 vector registers for temporaries
vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
vfloat64m1_t tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
vfloat64m1_t tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
vfloat64m1_t ACC0r = tmp0r;
vfloat64m1_t ACC0i = tmp0i;
vfloat64m1_t ACC1r = tmp1r;
vfloat64m1_t ACC1i = tmp1i;
for(BLASLONG k=1; k<K; k++) {
B0r = B[bi+0*2+0];
B0i = B[bi+0*2+1];
B1r = B[bi+1*2+0];
B1i = B[bi+1*2+1];
bi += 2*2;
A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
ai += 4*2;
tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
}
BLASLONG ci=n_top*ldc+m_top;
vfloat64m1_t C0r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
vfloat64m1_t C0i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
ci += ldc-gvl*0;
vfloat64m1_t C1r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
vfloat64m1_t C1i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
C1r = __riscv_vfmacc( C1r, alphar, ACC1r, gvl );
C1i = __riscv_vfmacc( C1i, alphar, ACC1i, gvl );
C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
ci=n_top*ldc+m_top;
__riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
__riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
ci += ldc-gvl*0;
__riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
__riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
m_top += 4;
}
if( M & 2 ) {
gvl = __riscv_vsetvl_e64m1(2);
BLASLONG ai=m_top*K*2;
BLASLONG bi=n_top*K*2;
double B0r = B[bi+0*2+0];
double B0i = B[bi+0*2+1];
double B1r = B[bi+1*2+0];
double B1i = B[bi+1*2+1];
bi += 2*2;
vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
ai += 2*2;
// 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
// leaving 26 vector registers for temporaries
vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
vfloat64m1_t tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
vfloat64m1_t tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
vfloat64m1_t ACC0r = tmp0r;
vfloat64m1_t ACC0i = tmp0i;
vfloat64m1_t ACC1r = tmp1r;
vfloat64m1_t ACC1i = tmp1i;
for(BLASLONG k=1; k<K; k++) {
B0r = B[bi+0*2+0];
B0i = B[bi+0*2+1];
B1r = B[bi+1*2+0];
B1i = B[bi+1*2+1];
bi += 2*2;
A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
ai += 2*2;
tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
}
BLASLONG ci=n_top*ldc+m_top;
vfloat64m1_t C0r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
vfloat64m1_t C0i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
ci += ldc-gvl*0;
vfloat64m1_t C1r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
vfloat64m1_t C1i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
C1r = __riscv_vfmacc( C1r, alphar, ACC1r, gvl );
C1i = __riscv_vfmacc( C1i, alphar, ACC1i, gvl );
C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
ci=n_top*ldc+m_top;
__riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
__riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
ci += ldc-gvl*0;
__riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
__riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
m_top += 2;
}
if( M & 1 ) {
double result0 = 0;
double result1 = 0;
double result2 = 0;
double result3 = 0;
BLASLONG ai=m_top*K*2;
BLASLONG bi=n_top*K*2;
for(BLASLONG k=0; k<K; k++) {
result0+=S0*A[ai+0+0]*B[bi+0+0] + S1*A[ai+0+1]*B[bi+0+1];
result1+=S2*A[ai+0+1]*B[bi+0+0] + S3*A[ai+0+0]*B[bi+0+1];
result2+=S0*A[ai+0+0]*B[bi+2+0] + S1*A[ai+0+1]*B[bi+2+1];
result3+=S2*A[ai+0+1]*B[bi+2+0] + S3*A[ai+0+0]*B[bi+2+1];
ai+=1*2;
bi+=2*2;
}
BLASLONG ci=n_top*ldc+m_top;
double Cr, Ci;
Cr = C[(ci+0*ldc+0)*2+0];
Ci = C[(ci+0*ldc+0)*2+1];
Cr += result0*alphar;
Ci += result1*alphar;
Cr -= result1*alphai;
Ci += result0*alphai;
C[(ci+0*ldc+0)*2+0] = Cr;
C[(ci+0*ldc+0)*2+1] = Ci;
Cr = C[(ci+1*ldc+0)*2+0];
Ci = C[(ci+1*ldc+0)*2+1];
Cr += result2*alphar;
Ci += result3*alphar;
Cr -= result3*alphai;
Ci += result2*alphai;
C[(ci+1*ldc+0)*2+0] = Cr;
C[(ci+1*ldc+0)*2+1] = Ci;
m_top+=1;
}
n_top += 2;
}
// -- N tail: one remaining column (N & 1)
if( N & 1 ) {
gvl = __riscv_vsetvl_e64m1(4);
m_top = 0;
for (BLASLONG i=0; i<M/8; i+=1) {
BLASLONG ai=m_top*K*2;
BLASLONG bi=n_top*K*2;
double B0r = B[bi+0*2+0];
double B0i = B[bi+0*2+1];
bi += 1*2;
vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
vfloat64m1_t A1r = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2], sizeof(FLOAT)*2, gvl );
vfloat64m1_t A1i = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2+1], sizeof(FLOAT)*2, gvl );
ai += 8*2;
// 4 vector regs to hold A array contents, 4 regs to hold values accumulated over k
// leaving 24 vector registers for temporaries
vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
vfloat64m1_t tmp1r = __riscv_vfmul_vf_f64m1( A1i, B0i, gvl);
vfloat64m1_t tmp1i = __riscv_vfmul_vf_f64m1( A1r, B0i, gvl);
tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
tmp1r = VFMACC_RR( tmp1r, B0r, A1r, gvl);
tmp1i = VFMACC_RI( tmp1i, B0r, A1i, gvl);
vfloat64m1_t ACC0r = tmp0r;
vfloat64m1_t ACC0i = tmp0i;
vfloat64m1_t ACC1r = tmp1r;
vfloat64m1_t ACC1i = tmp1i;
for(BLASLONG k=1; k<K; k++) {
B0r = B[bi+0*2+0];
B0i = B[bi+0*2+1];
bi += 1*2;
A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
A1r = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2], sizeof(FLOAT)*2, gvl );
A1i = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2+1], sizeof(FLOAT)*2, gvl );
ai += 8*2;
tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
tmp1r = __riscv_vfmul_vf_f64m1( A1i, B0i, gvl);
tmp1i = __riscv_vfmul_vf_f64m1( A1r, B0i, gvl);
tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
tmp1r = VFMACC_RR( tmp1r, B0r, A1r, gvl);
tmp1i = VFMACC_RI( tmp1i, B0r, A1i, gvl);
ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
}
BLASLONG ci=n_top*ldc+m_top;
vfloat64m1_t C0r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
vfloat64m1_t C0i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
ci += gvl;
vfloat64m1_t C1r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
vfloat64m1_t C1i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
C1r = __riscv_vfmacc( C1r, alphar, ACC1r, gvl );
C1i = __riscv_vfmacc( C1i, alphar, ACC1i, gvl );
C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
ci=n_top*ldc+m_top;
__riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
__riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
ci += gvl;
__riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
__riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
m_top += 8;
}
if( M & 4 ) {
gvl = __riscv_vsetvl_e64m1(4);
BLASLONG ai=m_top*K*2;
BLASLONG bi=n_top*K*2;
double B0r = B[bi+0*2+0];
double B0i = B[bi+0*2+1];
bi += 1*2;
vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
ai += 4*2;
// 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
// leaving 28 vector registers for temporaries
vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
vfloat64m1_t ACC0r = tmp0r;
vfloat64m1_t ACC0i = tmp0i;
for(BLASLONG k=1; k<K; k++) {
B0r = B[bi+0*2+0];
B0i = B[bi+0*2+1];
bi += 1*2;
A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
ai += 4*2;
tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
}
BLASLONG ci=n_top*ldc+m_top;
vfloat64m1_t C0r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
vfloat64m1_t C0i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
ci=n_top*ldc+m_top;
__riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
__riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
m_top += 4;
}
if( M & 2 ) {
gvl = __riscv_vsetvl_e64m1(2);
BLASLONG ai=m_top*K*2;
BLASLONG bi=n_top*K*2;
double B0r = B[bi+0*2+0];
double B0i = B[bi+0*2+1];
bi += 1*2;
vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
ai += 2*2;
// 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
// leaving 28 vector registers for temporaries
vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
vfloat64m1_t ACC0r = tmp0r;
vfloat64m1_t ACC0i = tmp0i;
for(BLASLONG k=1; k<K; k++) {
B0r = B[bi+0*2+0];
B0i = B[bi+0*2+1];
bi += 1*2;
A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
ai += 2*2;
tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
}
BLASLONG ci=n_top*ldc+m_top;
vfloat64m1_t C0r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
vfloat64m1_t C0i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
ci=n_top*ldc+m_top;
__riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
__riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
m_top += 2;
}
if( M & 1 ) {
double result0 = 0;
double result1 = 0;
BLASLONG ai=m_top*K*2;
BLASLONG bi=n_top*K*2;
for(BLASLONG k=0; k<K; k++) {
result0+=S0*A[ai+0+0]*B[bi+0+0] + S1*A[ai+0+1]*B[bi+0+1];
result1+=S2*A[ai+0+1]*B[bi+0+0] + S3*A[ai+0+0]*B[bi+0+1];
ai+=1*2;
bi+=1*2;
}
BLASLONG ci=n_top*ldc+m_top;
double Cr, Ci;
Cr = C[(ci+0*ldc+0)*2+0];
Ci = C[(ci+0*ldc+0)*2+1];
Cr += result0*alphar;
Ci += result1*alphar;
Cr -= result1*alphai;
Ci += result0*alphai;
C[(ci+0*ldc+0)*2+0] = Cr;
C[(ci+0*ldc+0)*2+1] = Ci;
m_top+=1;
}
n_top += 1;
}
return 0;
}