// kernel/riscv64/dtrmm_kernel_8x4_zvl128b.c

/*
AUTOGENERATED KERNEL
Script: ./kernel/riscv64/generate_kernel.py
Settings:
LMUL=4
M=8
M_tail_scalar_from=2
N=4
__riscv_='__riscv_'
complex=False
conjugate=False
cpu='zvl128b'
force_acc_double=False
index_type='BLASLONG'
op='trmm'
param_precision='double'
reg_width_bits=128
tail_policy=''
trace=False
Derived:
ELEN_ACC=64
ELEN_PARAM=64
LMUL_ACC=4
VFMACC='__riscv_vfmacc_vf_f64m4'
VFMUL='__riscv_vfmul_vf_f64m4'
VLEV='__riscv_vle64_v_f64m4'
VLSEV='__riscv_vlse64_v_f64m4'
VMACC_TO_ACC='__riscv_vfmacc_vf_f64m4'
VMUL_TO_ACC='__riscv_vfmul_vf_f64m4'
VSETVL='__riscv_vsetvl_e64m4'
VSEV='__riscv_vse64_v_f64m4'
VSSEV='__riscv_vsse64_v_f64m4'
acc_vector_t='vfloat64m4_t'
output='dtrmm_kernel_8x4_zvl128b.c'
param_scalar_t='double'
param_vector_t='vfloat64m4_t'
*/
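/*
Descriptive note (not part of the generated header): this kernel computes the
TRMM micro-update C = alpha * A * B on an 8x4 tile, where A and B are packed
panels (A advances 8 doubles per K step, B advances 4) and C is column-major
with leading dimension ldc. With VLEN=128 (zvl128b) and LMUL=4, one e64m4
vector group holds exactly 8 doubles, i.e. one full column strip of the tile,
so each K step is a single vector load of A plus four scalar-times-vector FMAs.
*/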
#include "common.h"
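// CNAME and the LEFT/TRANSA macros come from the build: OpenBLAS typically
// compiles this source once per side/transpose combination to produce the
// individual dtrmm kernel variants.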
#if defined(LEFT) != defined(TRANSA)
#define BACKWARDS
#endif
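/*
Descriptive note: "offset" encodes where the diagonal of the triangular
operand crosses the current tile. When LEFT and TRANSA differ (BACKWARDS),
the non-zero band is the trailing part of the K dimension, so each tile skips
its first `off` K steps by advancing ai/bi and shrinking pass_K. Otherwise
the non-zero band is the leading part, and pass_K is capped at off plus the
tile height (LEFT) or width (not LEFT).
*/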
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc, BLASLONG offset)
{
BLASLONG gvl = 0;
BLASLONG m_top = 0;
BLASLONG n_top = 0;
// -- main pass: 8x4 tiles over the N/4 column panels
for (BLASLONG j = 0; j < N / 4; j += 1) {
m_top = 0;
gvl = __riscv_vsetvl_e64m4(8);
for (BLASLONG i = 0; i < M / 8; i += 1) {
BLASLONG ai = m_top * K;
BLASLONG bi = n_top * K;
BLASLONG pass_K = K;
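// Locate the triangular boundary for this tile and trim the K loop to the
// non-zero band (see the BACKWARDS note above).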
#ifdef LEFT
BLASLONG off = offset + m_top;
#else
BLASLONG off = -offset + n_top;
#endif
#ifdef BACKWARDS
ai += off * 8;
bi += off * 4;
pass_K -= off;
#else
#ifdef LEFT
pass_K = off + 8;
#else
pass_K = off + 4;
#endif
#endif
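// For example, with LEFT and TRANSA both defined (so no BACKWARDS) and
// offset=0, the tile at m_top=8 gets off=8 and pass_K=16: only the first
// 16 K steps can contribute to this tile.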
double B0 = B[bi + 0];
double B1 = B[bi + 1];
double B2 = B[bi + 2];
double B3 = B[bi + 3];
bi += 4;
vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai], gvl);
ai += 8;
vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl);
vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl);
vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl);
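// The first K step is peeled: vfmul initializes the accumulators directly,
// avoiding a separate zeroing pass; the remaining pass_K - 1 steps use vfmacc.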
for (BLASLONG k = 1; k < pass_K; k++) {
B0 = B[bi + 0];
B1 = B[bi + 1];
B2 = B[bi + 2];
B3 = B[bi + 3];
bi += 4;
A0 = __riscv_vle64_v_f64m4(&A[ai], gvl);
ai += 8;
result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl);
result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl);
result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl);
}
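// Scale by alpha and store the tile one column of C at a time; TRMM kernels
// overwrite C, so the previous contents are never loaded.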
BLASLONG ci = n_top * ldc + m_top;
vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl);
vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl);
vfloat64m4_t c2 = __riscv_vfmul_vf_f64m4(result2, alpha, gvl);
vfloat64m4_t c3 = __riscv_vfmul_vf_f64m4(result3, alpha, gvl);
__riscv_vse64_v_f64m4(&C[ci], c0, gvl);
ci += ldc;
__riscv_vse64_v_f64m4(&C[ci], c1, gvl);
ci += ldc;
__riscv_vse64_v_f64m4(&C[ci], c2, gvl);
ci += ldc;
__riscv_vse64_v_f64m4(&C[ci], c3, gvl);
m_top += 8;
}
// -- M tails for the main pass (M & 4, M & 2, M & 1)
if (M & 4) {
gvl = __riscv_vsetvl_e64m4(4);
BLASLONG ai = m_top * K;
BLASLONG bi = n_top * K;
BLASLONG pass_K = K;
#ifdef LEFT
BLASLONG off = offset + m_top;
#else
BLASLONG off = -offset + n_top;
#endif
#ifdef BACKWARDS
ai += off * 4;
bi += off * 4;
pass_K -= off;
#else
pass_K = off + 4;
#endif
double B0 = B[bi + 0];
double B1 = B[bi + 1];
double B2 = B[bi + 2];
double B3 = B[bi + 3];
bi += 4;
vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai], gvl);
ai += 4;
vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl);
vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl);
vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl);
for (BLASLONG k = 1; k < pass_K; k++) {
B0 = B[bi + 0];
B1 = B[bi + 1];
B2 = B[bi + 2];
B3 = B[bi + 3];
bi += 4;
A0 = __riscv_vle64_v_f64m4(&A[ai], gvl);
ai += 4;
result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl);
result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl);
result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl);
}
BLASLONG ci = n_top * ldc + m_top;
vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl);
vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl);
vfloat64m4_t c2 = __riscv_vfmul_vf_f64m4(result2, alpha, gvl);
vfloat64m4_t c3 = __riscv_vfmul_vf_f64m4(result3, alpha, gvl);
__riscv_vse64_v_f64m4(&C[ci], c0, gvl);
ci += ldc;
__riscv_vse64_v_f64m4(&C[ci], c1, gvl);
ci += ldc;
__riscv_vse64_v_f64m4(&C[ci], c2, gvl);
ci += ldc;
__riscv_vse64_v_f64m4(&C[ci], c3, gvl);
m_top += 4;
}
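// Row remainders below 4 fall back to scalar accumulation
// (generator setting M_tail_scalar_from=2).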
if (M & 2) {
double result0 = 0;
double result1 = 0;
double result2 = 0;
double result3 = 0;
double result4 = 0;
double result5 = 0;
double result6 = 0;
double result7 = 0;
BLASLONG ai = m_top * K;
BLASLONG bi = n_top * K;
BLASLONG pass_K = K;
#ifdef LEFT
BLASLONG off = offset + m_top;
#else
BLASLONG off = -offset + n_top;
#endif
#ifdef BACKWARDS
ai += off * 2;
bi += off * 4;
pass_K -= off;
#else
#ifdef LEFT
pass_K = off + 2;
#else
pass_K = off + 4;
#endif
#endif
for (BLASLONG k = 0; k < pass_K; k++) {
result0 += A[ai + 0] * B[bi + 0];
result1 += A[ai + 1] * B[bi + 0];
result2 += A[ai + 0] * B[bi + 1];
result3 += A[ai + 1] * B[bi + 1];
result4 += A[ai + 0] * B[bi + 2];
result5 += A[ai + 1] * B[bi + 2];
result6 += A[ai + 0] * B[bi + 3];
result7 += A[ai + 1] * B[bi + 3];
ai += 2;
bi += 4;
}
BLASLONG ci = n_top * ldc + m_top;
C[ci + 0 * ldc + 0] = alpha * result0;
C[ci + 0 * ldc + 1] = alpha * result1;
C[ci + 1 * ldc + 0] = alpha * result2;
C[ci + 1 * ldc + 1] = alpha * result3;
C[ci + 2 * ldc + 0] = alpha * result4;
C[ci + 2 * ldc + 1] = alpha * result5;
C[ci + 3 * ldc + 0] = alpha * result6;
C[ci + 3 * ldc + 1] = alpha * result7;
m_top += 2;
}
if (M & 1) {
double result0 = 0;
double result1 = 0;
double result2 = 0;
double result3 = 0;
BLASLONG ai = m_top * K;
BLASLONG bi = n_top * K;
BLASLONG pass_K = K;
#ifdef LEFT
BLASLONG off = offset + m_top;
#else
BLASLONG off = -offset + n_top;
#endif
#ifdef BACKWARDS
ai += off * 1;
bi += off * 4;
pass_K -= off;
#else
#ifdef LEFT
pass_K = off + 1;
#else
pass_K = off + 4;
#endif
#endif
for (BLASLONG k = 0; k < pass_K; k++) {
result0 += A[ai + 0] * B[bi + 0];
result1 += A[ai + 0] * B[bi + 1];
result2 += A[ai + 0] * B[bi + 2];
result3 += A[ai + 0] * B[bi + 3];
ai += 1;
bi += 4;
}
BLASLONG ci = n_top * ldc + m_top;
C[ci + 0 * ldc + 0] = alpha * result0;
C[ci + 1 * ldc + 0] = alpha * result1;
C[ci + 2 * ldc + 0] = alpha * result2;
C[ci + 3 * ldc + 0] = alpha * result3;
m_top += 1;
}
n_top += 4;
}
// -- tail pass for N & 2 (two remaining columns; same structure as the main pass)
if (N & 2) {
gvl = __riscv_vsetvl_e64m4(8);
m_top = 0;
for (BLASLONG i = 0; i < M / 8; i += 1) {
BLASLONG ai = m_top * K;
BLASLONG bi = n_top * K;
BLASLONG pass_K = K;
#ifdef LEFT
BLASLONG off = offset + m_top;
#else
BLASLONG off = -offset + n_top;
#endif
#ifdef BACKWARDS
ai += off * 8;
bi += off * 2;
pass_K -= off;
#else
#ifdef LEFT
pass_K = off + 8;
#else
pass_K = off + 2;
#endif
#endif
double B0 = B[bi + 0];
double B1 = B[bi + 1];
bi += 2;
vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai], gvl);
ai += 8;
vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl);
for (BLASLONG k = 1; k < pass_K; k++) {
B0 = B[bi + 0];
B1 = B[bi + 1];
bi += 2;
A0 = __riscv_vle64_v_f64m4(&A[ai], gvl);
ai += 8;
result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl);
}
BLASLONG ci = n_top * ldc + m_top;
vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl);
vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl);
__riscv_vse64_v_f64m4(&C[ci], c0, gvl);
ci += ldc;
__riscv_vse64_v_f64m4(&C[ci], c1, gvl);
m_top += 8;
}
if (M & 4) {
gvl = __riscv_vsetvl_e64m4(4);
BLASLONG ai = m_top * K;
BLASLONG bi = n_top * K;
BLASLONG pass_K = K;
#ifdef LEFT
BLASLONG off = offset + m_top;
#else
BLASLONG off = -offset + n_top;
#endif
#ifdef BACKWARDS
ai += off * 4;
bi += off * 2;
pass_K -= off;
#else
#ifdef LEFT
pass_K = off + 4;
#else
pass_K = off + 2;
#endif
#endif
double B0 = B[bi + 0];
double B1 = B[bi + 1];
bi += 2;
vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai], gvl);
ai += 4;
vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl);
for (BLASLONG k = 1; k < pass_K; k++) {
B0 = B[bi + 0];
B1 = B[bi + 1];
bi += 2;
A0 = __riscv_vle64_v_f64m4(&A[ai], gvl);
ai += 4;
result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl);
}
BLASLONG ci = n_top * ldc + m_top;
vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl);
vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl);
__riscv_vse64_v_f64m4(&C[ci], c0, gvl);
ci += ldc;
__riscv_vse64_v_f64m4(&C[ci], c1, gvl);
m_top += 4;
}
if (M & 2) {
double result0 = 0;
double result1 = 0;
double result2 = 0;
double result3 = 0;
BLASLONG ai = m_top * K;
BLASLONG bi = n_top * K;
BLASLONG pass_K = K;
#ifdef LEFT
BLASLONG off = offset + m_top;
#else
BLASLONG off = -offset + n_top;
#endif
#ifdef BACKWARDS
ai += off * 2;
bi += off * 2;
pass_K -= off;
#else
pass_K = off + 2;
#endif
for (BLASLONG k = 0; k < pass_K; k++) {
result0 += A[ai + 0] * B[bi + 0];
result1 += A[ai + 1] * B[bi + 0];
result2 += A[ai + 0] * B[bi + 1];
result3 += A[ai + 1] * B[bi + 1];
ai += 2;
bi += 2;
}
BLASLONG ci = n_top * ldc + m_top;
C[ci + 0 * ldc + 0] = alpha * result0;
C[ci + 0 * ldc + 1] = alpha * result1;
C[ci + 1 * ldc + 0] = alpha * result2;
C[ci + 1 * ldc + 1] = alpha * result3;
m_top += 2;
}
if (M & 1) {
double result0 = 0;
double result1 = 0;
BLASLONG ai = m_top * K;
BLASLONG bi = n_top * K;
BLASLONG pass_K = K;
#ifdef LEFT
BLASLONG off = offset + m_top;
#else
BLASLONG off = -offset + n_top;
#endif
#ifdef BACKWARDS
ai += off * 1;
bi += off * 2;
pass_K -= off;
#else
#ifdef LEFT
pass_K = off + 1;
#else
pass_K = off + 2;
#endif
#endif
for (BLASLONG k = 0; k < pass_K; k++) {
result0 += A[ai + 0] * B[bi + 0];
result1 += A[ai + 0] * B[bi + 1];
ai += 1;
bi += 2;
}
BLASLONG ci = n_top * ldc + m_top;
C[ci + 0 * ldc + 0] = alpha * result0;
C[ci + 1 * ldc + 0] = alpha * result1;
m_top += 1;
}
n_top += 2;
}
// -- tail pass for N & 1 (one remaining column)
if (N & 1) {
gvl = __riscv_vsetvl_e64m4(8);
m_top = 0;
for (BLASLONG i = 0; i < M / 8; i += 1) {
BLASLONG ai = m_top * K;
BLASLONG bi = n_top * K;
BLASLONG pass_K = K;
#ifdef LEFT
BLASLONG off = offset + m_top;
#else
BLASLONG off = -offset + n_top;
#endif
#ifdef BACKWARDS
ai += off * 8;
bi += off * 1;
pass_K -= off;
#else
#ifdef LEFT
pass_K = off + 8;
#else
pass_K = off + 1;
#endif
#endif
double B0 = B[bi + 0];
bi += 1;
vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai], gvl);
ai += 8;
vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
for (BLASLONG k = 1; k < pass_K; k++) {
B0 = B[bi + 0];
bi += 1;
A0 = __riscv_vle64_v_f64m4(&A[ai], gvl);
ai += 8;
result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
}
BLASLONG ci = n_top * ldc + m_top;
vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl);
__riscv_vse64_v_f64m4(&C[ci], c0, gvl);
m_top += 8;
}
if (M & 4) {
gvl = __riscv_vsetvl_e64m4(4);
BLASLONG ai = m_top * K;
BLASLONG bi = n_top * K;
BLASLONG pass_K = K;
#ifdef LEFT
BLASLONG off = offset + m_top;
#else
BLASLONG off = -offset + n_top;
#endif
#ifdef BACKWARDS
ai += off * 4;
bi += off * 1;
pass_K -= off;
#else
#ifdef LEFT
pass_K = off + 4;
#else
pass_K = off + 1;
#endif
#endif
double B0 = B[bi + 0];
bi += 1;
vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai], gvl);
ai += 4;
vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
for (BLASLONG k = 1; k < pass_K; k++) {
B0 = B[bi + 0];
bi += 1;
A0 = __riscv_vle64_v_f64m4(&A[ai], gvl);
ai += 4;
result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
}
BLASLONG ci = n_top * ldc + m_top;
vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl);
__riscv_vse64_v_f64m4(&C[ci], c0, gvl);
m_top += 4;
}
if (M & 2) {
double result0 = 0;
double result1 = 0;
BLASLONG ai = m_top * K;
BLASLONG bi = n_top * K;
BLASLONG pass_K = K;
#ifdef LEFT
BLASLONG off = offset + m_top;
#else
BLASLONG off = -offset + n_top;
#endif
#ifdef BACKWARDS
ai += off * 2;
bi += off * 1;
pass_K -= off;
#else
#ifdef LEFT
pass_K = off + 2;
#else
pass_K = off + 1;
#endif
#endif
for (BLASLONG k = 0; k < pass_K; k++) {
result0 += A[ai + 0] * B[bi + 0];
result1 += A[ai + 1] * B[bi + 0];
ai += 2;
bi += 1;
}
BLASLONG ci = n_top * ldc + m_top;
C[ci + 0 * ldc + 0] = alpha * result0;
C[ci + 0 * ldc + 1] = alpha * result1;
m_top += 2;
}
if (M & 1) {
double result0 = 0;
BLASLONG ai = m_top * K;
BLASLONG bi = n_top * K;
BLASLONG pass_K = K;
#ifdef LEFT
BLASLONG off = offset + m_top;
#else
BLASLONG off = -offset + n_top;
#endif
#ifdef BACKWARDS
ai += off * 1;
bi += off * 1;
pass_K -= off;
#else
pass_K = off + 1;
#endif
for (BLASLONG k = 0; k < pass_K; k++) {
result0 += A[ai + 0] * B[bi + 0];
ai += 1;
bi += 1;
}
BLASLONG ci = n_top * ldc + m_top;
C[ci + 0 * ldc + 0] = alpha * result0;
m_top += 1;
}
n_top += 1;
}
return 0;
}