/*

    AUTOGENERATED KERNEL
    Script: ./kernel/riscv64/generate_kernel.py
    Settings:
        LMUL=2
        M=8
        M_tail_scalar_from=2
        N=4
        __riscv_='__riscv_'
        complex=True
        conjugate=False
        cpu='zvl128b'
        force_acc_double=False
        index_type='BLASLONG'
        op='gemm'
        param_precision='float'
        reg_width_bits=128
        tail_policy=''
        trace=False

    Derived:
        ELEN_ACC=32
        ELEN_PARAM=32
        LMUL_ACC=2
        VFMACC='__riscv_vfmacc_vf_f32m2'
        VFMUL='__riscv_vfmul_vf_f32m2'
        VLEV='__riscv_vle32_v_f32m2'
        VLSEV='__riscv_vlse32_v_f32m2'
        VMACC_TO_ACC='__riscv_vfmacc_vf_f32m2'
        VMUL_TO_ACC='__riscv_vfmul_vf_f32m2'
        VSETVL='__riscv_vsetvl_e32m2'
        VSEV='__riscv_vse32_v_f32m2'
        VSSEV='__riscv_vsse32_v_f32m2'
        acc_vector_t='vfloat32m2_t'
        output='cgemm_kernel_8x4_zvl128b.c'
        param_scalar_t='float'
        param_vector_t='vfloat32m2_t'

*/
#include "common.h"

#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define S0 1
#define S1 -1
#define S2 1
#define S3 1
#define VFMACC_RR __riscv_vfmsac
#define VFMACC_RI __riscv_vfmacc
#endif

#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define S0 1
#define S1 1
#define S2 1
#define S3 -1
#define VFMACC_RR __riscv_vfmacc
#define VFMACC_RI __riscv_vfmsac
#endif

#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define S0 1
#define S1 1
#define S2 -1
#define S3 1
#define VFMACC_RR __riscv_vfmacc
#define VFMACC_RI __riscv_vfnmsac
#endif

#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
#define S0 1
#define S1 -1
#define S2 -1
#define S3 -1
#define VFMACC_RR __riscv_vfmsac
#define VFMACC_RI __riscv_vfnmacc
#endif

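/*
 * Sign conventions: for a = ar + ai*i and b = br + bi*i,
 *   (a*b).real = ar*br - ai*bi        (a*b).imag = ai*br + ar*bi
 * and conjugating a (resp. b) flips the sign of ai (resp. bi).
 * S0..S3 apply these signs in the scalar tail loops below, while
 * VFMACC_RR/VFMACC_RI select the matching fused ops for the vector path:
 * vfmacc (vd + f*vs), vfmsac (f*vs - vd), vfnmsac (vd - f*vs),
 * vfnmacc (-(f*vs) - vd).
 */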
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc)
{
    BLASLONG gvl = 0;
    BLASLONG m_top = 0;
    BLASLONG n_top = 0;

    // -- MAIN PASS

    for (BLASLONG j = 0; j < N / 4; j += 1) {
        m_top = 0;
        BLASLONG gvl = __riscv_vsetvl_e32m2(8);

        for (BLASLONG i = 0; i < M / 8; i += 1) {
            BLASLONG ai = m_top * K * 2;
            BLASLONG bi = n_top * K * 2;
            float B0r = B[bi + 0 * 2 + 0];
            float B0i = B[bi + 0 * 2 + 1];
            float B1r = B[bi + 1 * 2 + 0];
            float B1i = B[bi + 1 * 2 + 1];
            float B2r = B[bi + 2 * 2 + 0];
            float B2i = B[bi + 2 * 2 + 1];
            float B3r = B[bi + 3 * 2 + 0];
            float B3i = B[bi + 3 * 2 + 1];
            bi += 4 * 2;

            vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
            vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
            ai += 8 * 2;

            // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
            // leaving 6 vector registers for temporaries
            // performing 2 operations between reuses of temporaries
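            // (register counts are in LMUL=2 groups: the 32 vector registers
            // form 16 m2 groups, so 2 for A plus 8 accumulators leaves 6 spare)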
            vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
            vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
            vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
            vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
            tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
            tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
            vfloat32m2_t ACC0r = tmp0r;
            vfloat32m2_t ACC0i = tmp0i;
            vfloat32m2_t ACC1r = tmp1r;
            vfloat32m2_t ACC1i = tmp1i;
            tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl);
            tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl);
            tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl);
            tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl);
            tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl);
            tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl);
            tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl);
            tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl);
            vfloat32m2_t ACC2r = tmp0r;
            vfloat32m2_t ACC2i = tmp0i;
            vfloat32m2_t ACC3r = tmp1r;
            vfloat32m2_t ACC3i = tmp1i;

            for (BLASLONG k = 1; k < K; k++) {
                B0r = B[bi + 0 * 2 + 0];
                B0i = B[bi + 0 * 2 + 1];
                B1r = B[bi + 1 * 2 + 0];
                B1i = B[bi + 1 * 2 + 1];
                B2r = B[bi + 2 * 2 + 0];
                B2i = B[bi + 2 * 2 + 1];
                B3r = B[bi + 3 * 2 + 0];
                B3i = B[bi + 3 * 2 + 1];
                bi += 4 * 2;

                A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
                A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
                ai += 8 * 2;

                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
                tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
                tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
                tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
                tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
                ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl);
                ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl);
                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl);
                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl);
                tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl);
                tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl);
                tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl);
                tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl);
                tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl);
                tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl);
                ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl);
                ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl);
                ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl);
                ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl);
            }

            BLASLONG ci = n_top * ldc + m_top;

            vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
            vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
            ci += ldc - gvl * 0;
            vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
            vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
            ci += ldc - gvl * 0;
            vfloat32m2_t C2r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
            vfloat32m2_t C2i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
            ci += ldc - gvl * 0;
            vfloat32m2_t C3r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
            vfloat32m2_t C3i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);

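            // C += alpha * ACC, complex: Cr += alphar*ACCr - alphai*ACCi,
            //                            Ci += alphar*ACCi + alphai*ACCr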
            C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl);
            C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl);
            C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl);
            C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl);
            C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl);
            C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl);
            C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl);
            C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl);
            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
            C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl);
            C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl);
            C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl);
            C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl);
            C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl);
            C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl);

            ci = n_top * ldc + m_top;

            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
            ci += ldc - gvl * 0;
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl);
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl);
            ci += ldc - gvl * 0;
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl);
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl);
            ci += ldc - gvl * 0;
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl);
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl);

            m_top += 8;
        }

        // -- tails for main pass

        if (M & 4) {
            gvl = __riscv_vsetvl_e32m2(4);

            BLASLONG ai = m_top * K * 2;
            BLASLONG bi = n_top * K * 2;
            float B0r = B[bi + 0 * 2 + 0];
            float B0i = B[bi + 0 * 2 + 1];
            float B1r = B[bi + 1 * 2 + 0];
            float B1i = B[bi + 1 * 2 + 1];
            float B2r = B[bi + 2 * 2 + 0];
            float B2i = B[bi + 2 * 2 + 1];
            float B3r = B[bi + 3 * 2 + 0];
            float B3i = B[bi + 3 * 2 + 1];
            bi += 4 * 2;

            vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
            vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
            ai += 4 * 2;

            // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
            // leaving 6 vector registers for temporaries
            // performing 2 operations between reuses of temporaries
            vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
            vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
            vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
            vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
            tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
            tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
            vfloat32m2_t ACC0r = tmp0r;
            vfloat32m2_t ACC0i = tmp0i;
            vfloat32m2_t ACC1r = tmp1r;
            vfloat32m2_t ACC1i = tmp1i;
            tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl);
            tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl);
            tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl);
            tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl);
            tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl);
            tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl);
            tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl);
            tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl);
            vfloat32m2_t ACC2r = tmp0r;
            vfloat32m2_t ACC2i = tmp0i;
            vfloat32m2_t ACC3r = tmp1r;
            vfloat32m2_t ACC3i = tmp1i;

            for (BLASLONG k = 1; k < K; k++) {
                B0r = B[bi + 0 * 2 + 0];
                B0i = B[bi + 0 * 2 + 1];
                B1r = B[bi + 1 * 2 + 0];
                B1i = B[bi + 1 * 2 + 1];
                B2r = B[bi + 2 * 2 + 0];
                B2i = B[bi + 2 * 2 + 1];
                B3r = B[bi + 3 * 2 + 0];
                B3i = B[bi + 3 * 2 + 1];
                bi += 4 * 2;

                A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
                A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
                ai += 4 * 2;

                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
                tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
                tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
                tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
                tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
                ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl);
                ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl);
                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl);
                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl);
                tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl);
                tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl);
                tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl);
                tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl);
                tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl);
                tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl);
                ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl);
                ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl);
                ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl);
                ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl);
            }

            BLASLONG ci = n_top * ldc + m_top;

            vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
            vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
            ci += ldc - gvl * 0;
            vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
            vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
            ci += ldc - gvl * 0;
            vfloat32m2_t C2r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
            vfloat32m2_t C2i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
            ci += ldc - gvl * 0;
            vfloat32m2_t C3r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
            vfloat32m2_t C3i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);

            C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl);
            C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl);
            C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl);
            C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl);
            C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl);
            C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl);
            C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl);
            C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl);
            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
            C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl);
            C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl);
            C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl);
            C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl);
            C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl);
            C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl);

            ci = n_top * ldc + m_top;

            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
            ci += ldc - gvl * 0;
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl);
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl);
            ci += ldc - gvl * 0;
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl);
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl);
            ci += ldc - gvl * 0;
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl);
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl);

            m_top += 4;
        }

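        // Scalar tails (M_tail_scalar_from=2): for each remaining C entry, an
        // even-numbered result accumulates the real part and the following odd
        // one the imaginary part, with conjugation handled by the S0..S3 signs.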
        if (M & 2) {
            float result0 = 0;
            float result1 = 0;
            float result2 = 0;
            float result3 = 0;
            float result4 = 0;
            float result5 = 0;
            float result6 = 0;
            float result7 = 0;
            float result8 = 0;
            float result9 = 0;
            float result10 = 0;
            float result11 = 0;
            float result12 = 0;
            float result13 = 0;
            float result14 = 0;
            float result15 = 0;
            BLASLONG ai = m_top * K * 2;
            BLASLONG bi = n_top * K * 2;

            for (BLASLONG k = 0; k < K; k++) {
                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
                result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1];
                result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1];
                result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1];
                result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1];
                result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1];
                result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1];
                result8 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1];
                result9 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1];
                result10 += S0 * A[ai + 2 + 0] * B[bi + 4 + 0] + S1 * A[ai + 2 + 1] * B[bi + 4 + 1];
                result11 += S2 * A[ai + 2 + 1] * B[bi + 4 + 0] + S3 * A[ai + 2 + 0] * B[bi + 4 + 1];
                result12 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1];
                result13 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1];
                result14 += S0 * A[ai + 2 + 0] * B[bi + 6 + 0] + S1 * A[ai + 2 + 1] * B[bi + 6 + 1];
                result15 += S2 * A[ai + 2 + 1] * B[bi + 6 + 0] + S3 * A[ai + 2 + 0] * B[bi + 6 + 1];
                ai += 2 * 2;
                bi += 4 * 2;
            }

            BLASLONG ci = n_top * ldc + m_top;
            float Cr, Ci;
            Cr = C[(ci + 0 * ldc + 0) * 2 + 0];
            Ci = C[(ci + 0 * ldc + 0) * 2 + 1];
            Cr += result0 * alphar;
            Ci += result1 * alphar;
            Cr -= result1 * alphai;
            Ci += result0 * alphai;
            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
            Cr = C[(ci + 0 * ldc + 1) * 2 + 0];
            Ci = C[(ci + 0 * ldc + 1) * 2 + 1];
            Cr += result2 * alphar;
            Ci += result3 * alphar;
            Cr -= result3 * alphai;
            Ci += result2 * alphai;
            C[(ci + 0 * ldc + 1) * 2 + 0] = Cr;
            C[(ci + 0 * ldc + 1) * 2 + 1] = Ci;
            Cr = C[(ci + 1 * ldc + 0) * 2 + 0];
            Ci = C[(ci + 1 * ldc + 0) * 2 + 1];
            Cr += result4 * alphar;
            Ci += result5 * alphar;
            Cr -= result5 * alphai;
            Ci += result4 * alphai;
            C[(ci + 1 * ldc + 0) * 2 + 0] = Cr;
            C[(ci + 1 * ldc + 0) * 2 + 1] = Ci;
            Cr = C[(ci + 1 * ldc + 1) * 2 + 0];
            Ci = C[(ci + 1 * ldc + 1) * 2 + 1];
            Cr += result6 * alphar;
            Ci += result7 * alphar;
            Cr -= result7 * alphai;
            Ci += result6 * alphai;
            C[(ci + 1 * ldc + 1) * 2 + 0] = Cr;
            C[(ci + 1 * ldc + 1) * 2 + 1] = Ci;
            Cr = C[(ci + 2 * ldc + 0) * 2 + 0];
            Ci = C[(ci + 2 * ldc + 0) * 2 + 1];
            Cr += result8 * alphar;
            Ci += result9 * alphar;
            Cr -= result9 * alphai;
            Ci += result8 * alphai;
            C[(ci + 2 * ldc + 0) * 2 + 0] = Cr;
            C[(ci + 2 * ldc + 0) * 2 + 1] = Ci;
            Cr = C[(ci + 2 * ldc + 1) * 2 + 0];
            Ci = C[(ci + 2 * ldc + 1) * 2 + 1];
            Cr += result10 * alphar;
            Ci += result11 * alphar;
            Cr -= result11 * alphai;
            Ci += result10 * alphai;
            C[(ci + 2 * ldc + 1) * 2 + 0] = Cr;
            C[(ci + 2 * ldc + 1) * 2 + 1] = Ci;
            Cr = C[(ci + 3 * ldc + 0) * 2 + 0];
            Ci = C[(ci + 3 * ldc + 0) * 2 + 1];
            Cr += result12 * alphar;
            Ci += result13 * alphar;
            Cr -= result13 * alphai;
            Ci += result12 * alphai;
            C[(ci + 3 * ldc + 0) * 2 + 0] = Cr;
            C[(ci + 3 * ldc + 0) * 2 + 1] = Ci;
            Cr = C[(ci + 3 * ldc + 1) * 2 + 0];
            Ci = C[(ci + 3 * ldc + 1) * 2 + 1];
            Cr += result14 * alphar;
            Ci += result15 * alphar;
            Cr -= result15 * alphai;
            Ci += result14 * alphai;
            C[(ci + 3 * ldc + 1) * 2 + 0] = Cr;
            C[(ci + 3 * ldc + 1) * 2 + 1] = Ci;
            m_top += 2;
        }

        if (M & 1) {
            float result0 = 0;
            float result1 = 0;
            float result2 = 0;
            float result3 = 0;
            float result4 = 0;
            float result5 = 0;
            float result6 = 0;
            float result7 = 0;
            BLASLONG ai = m_top * K * 2;
            BLASLONG bi = n_top * K * 2;

            for (BLASLONG k = 0; k < K; k++) {
                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
                result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1];
                result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1];
                result4 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1];
                result5 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1];
                result6 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1];
                result7 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1];
                ai += 1 * 2;
                bi += 4 * 2;
            }

            BLASLONG ci = n_top * ldc + m_top;
            float Cr, Ci;
            Cr = C[(ci + 0 * ldc + 0) * 2 + 0];
            Ci = C[(ci + 0 * ldc + 0) * 2 + 1];
            Cr += result0 * alphar;
            Ci += result1 * alphar;
            Cr -= result1 * alphai;
            Ci += result0 * alphai;
            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
            Cr = C[(ci + 1 * ldc + 0) * 2 + 0];
            Ci = C[(ci + 1 * ldc + 0) * 2 + 1];
            Cr += result2 * alphar;
            Ci += result3 * alphar;
            Cr -= result3 * alphai;
            Ci += result2 * alphai;
            C[(ci + 1 * ldc + 0) * 2 + 0] = Cr;
            C[(ci + 1 * ldc + 0) * 2 + 1] = Ci;
            Cr = C[(ci + 2 * ldc + 0) * 2 + 0];
            Ci = C[(ci + 2 * ldc + 0) * 2 + 1];
            Cr += result4 * alphar;
            Ci += result5 * alphar;
            Cr -= result5 * alphai;
            Ci += result4 * alphai;
            C[(ci + 2 * ldc + 0) * 2 + 0] = Cr;
            C[(ci + 2 * ldc + 0) * 2 + 1] = Ci;
            Cr = C[(ci + 3 * ldc + 0) * 2 + 0];
            Ci = C[(ci + 3 * ldc + 0) * 2 + 1];
            Cr += result6 * alphar;
            Ci += result7 * alphar;
            Cr -= result7 * alphai;
            Ci += result6 * alphai;
            C[(ci + 3 * ldc + 0) * 2 + 0] = Cr;
            C[(ci + 3 * ldc + 0) * 2 + 1] = Ci;
            m_top += 1;
        }

        n_top += 4;
    }

    // -- tails for N=2

    if (N & 2) {
        gvl = __riscv_vsetvl_e32m2(8);
        m_top = 0;

        for (BLASLONG i = 0; i < M / 8; i += 1) {
            BLASLONG ai = m_top * K * 2;
            BLASLONG bi = n_top * K * 2;
            float B0r = B[bi + 0 * 2 + 0];
            float B0i = B[bi + 0 * 2 + 1];
            float B1r = B[bi + 1 * 2 + 0];
            float B1i = B[bi + 1 * 2 + 1];
            bi += 2 * 2;

            vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
            vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
            ai += 8 * 2;

            // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
            // leaving 10 vector registers for temporaries
            vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
            vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
            vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
            vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
            tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
            tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
            vfloat32m2_t ACC0r = tmp0r;
            vfloat32m2_t ACC0i = tmp0i;
            vfloat32m2_t ACC1r = tmp1r;
            vfloat32m2_t ACC1i = tmp1i;

            for (BLASLONG k = 1; k < K; k++) {
                B0r = B[bi + 0 * 2 + 0];
                B0i = B[bi + 0 * 2 + 1];
                B1r = B[bi + 1 * 2 + 0];
                B1i = B[bi + 1 * 2 + 1];
                bi += 2 * 2;

                A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
                A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
                ai += 8 * 2;

                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
                tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
                tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
                tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
                tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
                ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl);
                ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl);
            }

            BLASLONG ci = n_top * ldc + m_top;

            vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
            vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
            ci += ldc - gvl * 0;
            vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
            vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);

            C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl);
            C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl);
            C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl);
            C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl);
            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
            C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl);
            C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl);

            ci = n_top * ldc + m_top;

            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
            ci += ldc - gvl * 0;
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl);
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl);

            m_top += 8;
        }

        if (M & 4) {
            gvl = __riscv_vsetvl_e32m2(4);

            BLASLONG ai = m_top * K * 2;
            BLASLONG bi = n_top * K * 2;
            float B0r = B[bi + 0 * 2 + 0];
            float B0i = B[bi + 0 * 2 + 1];
            float B1r = B[bi + 1 * 2 + 0];
            float B1i = B[bi + 1 * 2 + 1];
            bi += 2 * 2;

            vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
            vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
            ai += 4 * 2;

            // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
            // leaving 10 vector registers for temporaries
            vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
            vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
            vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
            vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
            tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
            tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
            vfloat32m2_t ACC0r = tmp0r;
            vfloat32m2_t ACC0i = tmp0i;
            vfloat32m2_t ACC1r = tmp1r;
            vfloat32m2_t ACC1i = tmp1i;

            for (BLASLONG k = 1; k < K; k++) {
                B0r = B[bi + 0 * 2 + 0];
                B0i = B[bi + 0 * 2 + 1];
                B1r = B[bi + 1 * 2 + 0];
                B1i = B[bi + 1 * 2 + 1];
                bi += 2 * 2;

                A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
                A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
                ai += 4 * 2;

                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
                tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
                tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
                tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
                tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
                ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl);
                ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl);
            }

            BLASLONG ci = n_top * ldc + m_top;

            vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
            vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
            ci += ldc - gvl * 0;
            vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
            vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);

            C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl);
            C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl);
            C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl);
            C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl);
            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
            C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl);
            C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl);

            ci = n_top * ldc + m_top;

            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
            ci += ldc - gvl * 0;
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl);
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl);

            m_top += 4;
        }

        if (M & 2) {
            float result0 = 0;
            float result1 = 0;
            float result2 = 0;
            float result3 = 0;
            float result4 = 0;
            float result5 = 0;
            float result6 = 0;
            float result7 = 0;
            BLASLONG ai = m_top * K * 2;
            BLASLONG bi = n_top * K * 2;

            for (BLASLONG k = 0; k < K; k++) {
                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
                result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1];
                result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1];
                result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1];
                result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1];
                result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1];
                result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1];
                ai += 2 * 2;
                bi += 2 * 2;
            }

            BLASLONG ci = n_top * ldc + m_top;
            float Cr, Ci;
            Cr = C[(ci + 0 * ldc + 0) * 2 + 0];
            Ci = C[(ci + 0 * ldc + 0) * 2 + 1];
            Cr += result0 * alphar;
            Ci += result1 * alphar;
            Cr -= result1 * alphai;
            Ci += result0 * alphai;
            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
            Cr = C[(ci + 0 * ldc + 1) * 2 + 0];
            Ci = C[(ci + 0 * ldc + 1) * 2 + 1];
            Cr += result2 * alphar;
            Ci += result3 * alphar;
            Cr -= result3 * alphai;
            Ci += result2 * alphai;
            C[(ci + 0 * ldc + 1) * 2 + 0] = Cr;
            C[(ci + 0 * ldc + 1) * 2 + 1] = Ci;
            Cr = C[(ci + 1 * ldc + 0) * 2 + 0];
            Ci = C[(ci + 1 * ldc + 0) * 2 + 1];
            Cr += result4 * alphar;
            Ci += result5 * alphar;
            Cr -= result5 * alphai;
            Ci += result4 * alphai;
            C[(ci + 1 * ldc + 0) * 2 + 0] = Cr;
            C[(ci + 1 * ldc + 0) * 2 + 1] = Ci;
            Cr = C[(ci + 1 * ldc + 1) * 2 + 0];
            Ci = C[(ci + 1 * ldc + 1) * 2 + 1];
            Cr += result6 * alphar;
            Ci += result7 * alphar;
            Cr -= result7 * alphai;
            Ci += result6 * alphai;
            C[(ci + 1 * ldc + 1) * 2 + 0] = Cr;
            C[(ci + 1 * ldc + 1) * 2 + 1] = Ci;
            m_top += 2;
        }

        if (M & 1) {
            float result0 = 0;
            float result1 = 0;
            float result2 = 0;
            float result3 = 0;
            BLASLONG ai = m_top * K * 2;
            BLASLONG bi = n_top * K * 2;

            for (BLASLONG k = 0; k < K; k++) {
                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
                result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1];
                result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1];
                ai += 1 * 2;
                bi += 2 * 2;
            }

            BLASLONG ci = n_top * ldc + m_top;
            float Cr, Ci;
            Cr = C[(ci + 0 * ldc + 0) * 2 + 0];
            Ci = C[(ci + 0 * ldc + 0) * 2 + 1];
            Cr += result0 * alphar;
            Ci += result1 * alphar;
            Cr -= result1 * alphai;
            Ci += result0 * alphai;
            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
            Cr = C[(ci + 1 * ldc + 0) * 2 + 0];
            Ci = C[(ci + 1 * ldc + 0) * 2 + 1];
            Cr += result2 * alphar;
            Ci += result3 * alphar;
            Cr -= result3 * alphai;
            Ci += result2 * alphai;
            C[(ci + 1 * ldc + 0) * 2 + 0] = Cr;
            C[(ci + 1 * ldc + 0) * 2 + 1] = Ci;
            m_top += 1;
        }

        n_top += 2;
    }

    // -- tails for N=1

    if (N & 1) {
        gvl = __riscv_vsetvl_e32m2(8);
        m_top = 0;

        for (BLASLONG i = 0; i < M / 8; i += 1) {
            BLASLONG ai = m_top * K * 2;
            BLASLONG bi = n_top * K * 2;
            float B0r = B[bi + 0 * 2 + 0];
            float B0i = B[bi + 0 * 2 + 1];
            bi += 1 * 2;

            vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
            vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
            ai += 8 * 2;

            // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
            // leaving 12 vector registers for temporaries
            vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
            vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
            vfloat32m2_t ACC0r = tmp0r;
            vfloat32m2_t ACC0i = tmp0i;

            for (BLASLONG k = 1; k < K; k++) {
                B0r = B[bi + 0 * 2 + 0];
                B0i = B[bi + 0 * 2 + 1];
                bi += 1 * 2;

                A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
                A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
                ai += 8 * 2;

                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
            }

            BLASLONG ci = n_top * ldc + m_top;

            vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
            vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);

            C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl);
            C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl);
            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);

            ci = n_top * ldc + m_top;

            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);

            m_top += 8;
        }

        if (M & 4) {
            gvl = __riscv_vsetvl_e32m2(4);

            BLASLONG ai = m_top * K * 2;
            BLASLONG bi = n_top * K * 2;
            float B0r = B[bi + 0 * 2 + 0];
            float B0i = B[bi + 0 * 2 + 1];
            bi += 1 * 2;

            vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
            vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
            ai += 4 * 2;

            // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
            // leaving 12 vector registers for temporaries
            vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
            vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
            vfloat32m2_t ACC0r = tmp0r;
            vfloat32m2_t ACC0i = tmp0i;

            for (BLASLONG k = 1; k < K; k++) {
                B0r = B[bi + 0 * 2 + 0];
                B0i = B[bi + 0 * 2 + 1];
                bi += 1 * 2;

                A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
                A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
                ai += 4 * 2;

                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
            }

            BLASLONG ci = n_top * ldc + m_top;

            vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
            vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);

            C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl);
            C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl);
            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);

            ci = n_top * ldc + m_top;

            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);

            m_top += 4;
        }

        if (M & 2) {
            float result0 = 0;
            float result1 = 0;
            float result2 = 0;
            float result3 = 0;
            BLASLONG ai = m_top * K * 2;
            BLASLONG bi = n_top * K * 2;

            for (BLASLONG k = 0; k < K; k++) {
                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
                result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1];
                result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1];
                ai += 2 * 2;
                bi += 1 * 2;
            }

            BLASLONG ci = n_top * ldc + m_top;
            float Cr, Ci;
            Cr = C[(ci + 0 * ldc + 0) * 2 + 0];
            Ci = C[(ci + 0 * ldc + 0) * 2 + 1];
            Cr += result0 * alphar;
            Ci += result1 * alphar;
            Cr -= result1 * alphai;
            Ci += result0 * alphai;
            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
            Cr = C[(ci + 0 * ldc + 1) * 2 + 0];
            Ci = C[(ci + 0 * ldc + 1) * 2 + 1];
            Cr += result2 * alphar;
            Ci += result3 * alphar;
            Cr -= result3 * alphai;
            Ci += result2 * alphai;
            C[(ci + 0 * ldc + 1) * 2 + 0] = Cr;
            C[(ci + 0 * ldc + 1) * 2 + 1] = Ci;
            m_top += 2;
        }

        if (M & 1) {
            float result0 = 0;
            float result1 = 0;
            BLASLONG ai = m_top * K * 2;
            BLASLONG bi = n_top * K * 2;

            for (BLASLONG k = 0; k < K; k++) {
                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
                ai += 1 * 2;
                bi += 1 * 2;
            }

            BLASLONG ci = n_top * ldc + m_top;
            float Cr, Ci;
            Cr = C[(ci + 0 * ldc + 0) * 2 + 0];
            Ci = C[(ci + 0 * ldc + 0) * 2 + 1];
            Cr += result0 * alphar;
            Ci += result1 * alphar;
            Cr -= result1 * alphai;
            Ci += result0 * alphai;
            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
            m_top += 1;
        }

        n_top += 1;
    }

    return 0;
}