Merge pull request #3893 from HellerZheng/develop

add riscv level3 C,Z kernel functions.
This commit is contained in:
Zhang Xianyi 2023-03-15 10:17:13 +08:00 committed by GitHub
commit 19f17c8bc6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
25 changed files with 3355 additions and 2293 deletions

View File

@ -118,8 +118,8 @@ DGEMVTKERNEL = gemv_t_rvv.c
CGEMVTKERNEL = zgemv_t_rvv.c
ZGEMVTKERNEL = zgemv_t_rvv.c
CTRMMKERNEL = ztrmmkernel_2x2_rvv.c
ZTRMMKERNEL = ztrmmkernel_2x2_rvv.c
CTRMMKERNEL = ztrmmkernel_rvv_v1x4.c
ZTRMMKERNEL = ztrmmkernel_rvv_v1x4.c
# SGEMM_UNROLL_N set in params.h
ifeq ($(SGEMM_UNROLL_N), 8)
@ -168,17 +168,28 @@ DSYMMUCOPY_M = symm_ucopy_rvv_v1.c
DSYMMLCOPY_M = symm_lcopy_rvv_v1.c
endif
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
CGEMMKERNEL = zgemmkernel_rvv_v1x4.c
CGEMMINCOPY = zgemm_ncopy_rvv_v1.c
CGEMMITCOPY = zgemm_tcopy_rvv_v1.c
CGEMMONCOPY = zgemm_ncopy_4_rvv.c
CGEMMOTCOPY = zgemm_tcopy_4_rvv.c
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemmkernel_rvv_v1x4.c
ZGEMMINCOPY = zgemm_ncopy_rvv_v1.c
ZGEMMITCOPY = zgemm_tcopy_rvv_v1.c
ZGEMMONCOPY = zgemm_ncopy_4_rvv.c
ZGEMMOTCOPY = zgemm_tcopy_4_rvv.c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
STRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c
STRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c
@ -190,20 +201,25 @@ DTRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c
DTRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c
DTRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c
CTRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c
CTRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c
CTRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c
ZTRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c
ZTRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c
ZTRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c
TRSMCOPYLN_M = trsm_lncopy_rvv_v1.c
TRSMCOPYLT_M = trsm_ltcopy_rvv_v1.c
TRSMCOPYUN_M = trsm_uncopy_rvv_v1.c
TRSMCOPYUT_M = trsm_utcopy_rvv_v1.c
TRSMCOPYLN_M = trsm_lncopy_rvv_v1.c
TRSMCOPYLT_M = trsm_ltcopy_rvv_v1.c
TRSMCOPYUN_M = trsm_uncopy_rvv_v1.c
TRSMCOPYUT_M = trsm_utcopy_rvv_v1.c
ZTRSMCOPYLN_M = ztrsm_lncopy_rvv_v1.c
ZTRSMCOPYLT_M = ztrsm_ltcopy_rvv_v1.c
ZTRSMCOPYUN_M = ztrsm_uncopy_rvv_v1.c
ZTRSMCOPYUT_M = ztrsm_utcopy_rvv_v1.c
SSYMV_U_KERNEL = symv_U_rvv.c
SSYMV_L_KERNEL = symv_L_rvv.c
@ -214,6 +230,27 @@ CSYMV_L_KERNEL = ../generic/zsymv_k.c
ZSYMV_U_KERNEL = ../generic/zsymv_k.c
ZSYMV_L_KERNEL = ../generic/zsymv_k.c
ZHEMMLTCOPY_M = zhemm_ltcopy_rvv_v1.c
ZHEMMUTCOPY_M = zhemm_utcopy_rvv_v1.c
CHEMMLTCOPY_M = zhemm_ltcopy_rvv_v1.c
CHEMMUTCOPY_M = zhemm_utcopy_rvv_v1.c
ZSYMMUCOPY_M = zsymm_ucopy_rvv_v1.c
ZSYMMLCOPY_M = zsymm_lcopy_rvv_v1.c
CSYMMUCOPY_M = zsymm_ucopy_rvv_v1.c
CSYMMLCOPY_M = zsymm_lcopy_rvv_v1.c
ZTRMMUNCOPY_M = ztrmm_uncopy_rvv_v1.c
ZTRMMLNCOPY_M = ztrmm_lncopy_rvv_v1.c
ZTRMMUTCOPY_M = ztrmm_utcopy_rvv_v1.c
ZTRMMLTCOPY_M = ztrmm_ltcopy_rvv_v1.c
CTRMMUNCOPY_M = ztrmm_uncopy_rvv_v1.c
CTRMMLNCOPY_M = ztrmm_lncopy_rvv_v1.c
CTRMMUTCOPY_M = ztrmm_utcopy_rvv_v1.c
CTRMMLTCOPY_M = ztrmm_ltcopy_rvv_v1.c
LSAME_KERNEL = ../generic/lsame.c

View File

@ -36,10 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define VSEV_FLOAT vse32_v_f32m2
#define VLSEV_FLOAT vlse32_v_f32m2
#define VBOOL_T vbool16_t
#define UINT_V_T vint32m2_t
#define VID_V_UINT vid_v_i32m2
#define VMSGTU_VX_UINT vmsgt_vx_i32m2_b16
#define VMSEQ_VX_UINT vmseq_vx_i32m2_b16
#define UINT_V_T vuint32m2_t
#define VID_V_UINT vid_v_u32m2
#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16
#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16
#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2
#else
#define VSETVL(n) vsetvl_e64m2(n)

View File

@ -31,28 +31,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define VSETVL(n) vsetvl_e32m2(n)
#define VSETVL_MAX vsetvlmax_e32m2()
#define FLOAT_V_T vfloat32m2_t
#define VLEV_FLOAT vle32_v_f32m2
#define VLSEV_FLOAT vlse32_v_f32m2
#define VLSEG2_FLOAT vlseg2e32_v_f32m2
#define VSEV_FLOAT vse32_v_f32m2
#define VSSEV_FLOAT vsse32_v_f32m2
#define VSEV_FLOAT vse32_v_f32m2
#define VLSEG2_FLOAT vlseg2e32_v_f32m2
#define VSSEG2_FLOAT vsseg2e32_v_f32m2
#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2
#define VSSSEG2_FLOAT vssseg2e32_v_f32m2
#define VFMACCVF_FLOAT vfmacc_vf_f32m2
#define VFMULVF_FLOAT vfmul_vf_f32m2
#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2
#define VFMULVF_FLOAT vfmul_vf_f32m2
#else
#define VSETVL(n) vsetvl_e64m2(n)
#define VSETVL_MAX vsetvlmax_e64m2()
#define FLOAT_V_T vfloat64m2_t
#define VLEV_FLOAT vle64_v_f64m2
#define VLSEV_FLOAT vlse64_v_f64m2
#define VLSEG2_FLOAT vlseg2e64_v_f64m2
#define VSEV_FLOAT vse64_v_f64m2
#define VSSEV_FLOAT vsse64_v_f64m2
#define VSEV_FLOAT vse64_v_f64m2
#define VLSEG2_FLOAT vlseg2e64_v_f64m2
#define VSSEG2_FLOAT vsseg2e64_v_f64m2
#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2
#define VSSSEG2_FLOAT vssseg2e64_v_f64m2
#define VFMVVF_FLOAT vfmv_v_f_f64m2
#define VFMACCVF_FLOAT vfmacc_vf_f64m2
#define VFMULVF_FLOAT vfmul_vf_f64m2
#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2
#define VFMULVF_FLOAT vfmul_vf_f64m2
#endif
@ -88,606 +91,107 @@ static FLOAT dm1 = -1.;
#ifndef COMPLEX
#if GEMM_DEFAULT_UNROLL_N == 1
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
FLOAT aa, bb;
FLOAT *pa, *pc;
FLOAT aa;
FLOAT* pc;
int i, j, k;
//fprintf(stderr, "%s , %s, m = %4ld n = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, ldc); // Debug
BLASLONG stride_ldc = sizeof(FLOAT) * ldc;
FLOAT_V_T vb, vc;
size_t vl;
FLOAT_V_T va, vc;
a += (m - 1) * m;
b += (m - 1) * n;
for (i = m - 1; i >= 0; i--)
{
for (i = m - 1; i >= 0; i--) {
aa = *(a + i);
for (j = 0; j < n; j ++)
{
bb = *(c + i + j * ldc);
bb *= aa;
*b = bb;
*(c + i + j * ldc) = bb;
b ++;
pc = c;
for (j = n; j > 0; j -= vl) {
vl = VSETVL(j);
vb = VLSEV_FLOAT(pc + i, stride_ldc, vl);
vb = VFMULVF_FLOAT(vb, aa, vl);
VSEV_FLOAT(b, vb, vl);
VSSEV_FLOAT(pc + i, stride_ldc, vb, vl);
b += vl;
pa = a;
pc = c + j * ldc;
for (k = i; k > 0; k -= vl)
{
vl = VSETVL(k);
vc = VLEV_FLOAT(pc, vl);
va = VLEV_FLOAT(pa, vl);
vc = VFNMSACVF_FLOAT(vc, bb, va, vl);
VSEV_FLOAT(pc, vc, vl);
pa += vl;
pc += vl;
for (k = 0; k < i; k ++) {
vc = VLSEV_FLOAT(pc + k, stride_ldc, vl);
vc = VFNMSACVF_FLOAT(vc, *(a + k), vb, vl);
VSSEV_FLOAT(pc + k, stride_ldc, vc, vl);
}
pc += vl * ldc;
}
a -= m;
b -= 2 * n;
}
}
#elif GEMM_DEFAULT_UNROLL_N == 2
/*
 * LN triangular solve on one packed tile, real case, columns taken two at a
 * time (GEMM_DEFAULT_UNROLL_N == 2).
 *
 * Rows are visited from m - 1 down to 0.  For row i every c[i, col] is
 * multiplied by a[i] (the current row of `a`); the product is appended to
 * `b` and written back to `c`.  The product is then eliminated from rows
 * 0 .. i-1 of the same column with a unit-stride vector multiply-subtract
 * against a[0 .. i-1].
 *
 * NOTE(review): the diagonal entry is multiplied, not divided, so `a`
 * presumably carries a pre-inverted diagonal -- confirm against the packing
 * routine.
 *
 * m, n  tile size (rows, columns)
 * a     m x m coefficient tile, row stride m
 * b     m x n packed output, row stride n
 * c     tile updated in place, column stride ldc
 */
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {

    FLOAT aa, bb0, bb1;
    FLOAT *pa, *pc, *pc0, *pc1;
    FLOAT *pb0, *pb1;

    int i, j, k;
    size_t vl;
    FLOAT_V_T va, vc0, vc1;

    /* No tracing here: this runs once per tile inside the TRSM driver. */

    /* Start at the last row of both a and b. */
    a += (m - 1) * m;
    b += (m - 1) * n;

    for (i = m - 1; i >= 0; i--) {
        aa = *(a + i);
        pc = c + i;                         /* element (i, 0) */

        for (j = 0; j < n/2; j++) {
            pb0 = pc + j * ldc * 2;
            pb1 = pb0 + ldc;

            bb0 = (*pb0) * aa;
            bb1 = (*pb1) * aa;

            *b       = bb0;
            *(b + 1) = bb1;
            *pb0     = bb0;
            *pb1     = bb1;
            b += 2;

            /* Tops of the two columns; rows 0 .. i-1 get updated. */
            pc0 = c + j * ldc * 2;
            pc1 = pc0 + ldc;
            pa  = a;
            for (k = i; k > 0; k -= vl) {
                vl  = VSETVL(k);
                vc0 = VLEV_FLOAT(pc0, vl);
                vc1 = VLEV_FLOAT(pc1, vl);
                va  = VLEV_FLOAT(pa, vl);
                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
                vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
                VSEV_FLOAT(pc0, vc0, vl);
                VSEV_FLOAT(pc1, vc1, vl);
                pa  += vl;
                pc0 += vl;
                pc1 += vl;
            }
        }
        pc += ldc * (n/2) * 2;              /* element (i, first leftover column) */

        if (n & 1) {
            pb0 = pc;
            bb0 = (*pb0) * aa;
            *b   = bb0;
            *pb0 = bb0;
            b += 1;

            pc0 = pc - i;                   /* top of the leftover column */
            pa  = a;
            for (k = i; k > 0; k -= vl) {
                vl  = VSETVL(k);
                vc0 = VLEV_FLOAT(pc0, vl);
                va  = VLEV_FLOAT(pa, vl);
                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
                VSEV_FLOAT(pc0, vc0, vl);
                pa  += vl;
                pc0 += vl;
            }
        }

        /* b advanced by n above; step back to the previous row of each. */
        a -= m;
        b -= 2 * n;
    }
}
#elif GEMM_DEFAULT_UNROLL_N == 4
/*
 * LN triangular solve on one packed tile, real case, columns taken four at a
 * time (GEMM_DEFAULT_UNROLL_N == 4) with 2-column and 1-column tails.
 *
 * Rows are visited from m - 1 down to 0.  For row i every c[i, col] is
 * multiplied by a[i]; the product is appended to `b` and written back to
 * `c`, then eliminated from rows 0 .. i-1 of that column with a unit-stride
 * vector multiply-subtract against a[0 .. i-1].
 *
 * NOTE(review): the diagonal entry is multiplied, not divided, so `a`
 * presumably carries a pre-inverted diagonal -- confirm against the packing
 * routine.
 *
 * m, n  tile size (rows, columns)
 * a     m x m coefficient tile, row stride m
 * b     m x n packed output, row stride n
 * c     tile updated in place, column stride ldc
 */
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {

    FLOAT aa, bb0, bb1, bb2, bb3;
    FLOAT *pa, *pc, *pc0, *pc1, *pc2, *pc3;
    FLOAT *pb0, *pb1, *pb2, *pb3;

    int i, j, k;
    size_t vl;
    FLOAT_V_T va, vc0, vc1, vc2, vc3;

    /* Start at the last row of both a and b. */
    a += (m - 1) * m;
    b += (m - 1) * n;

    for (i = m - 1; i >= 0; i--) {
        aa = *(a + i);
        pc = c + i;                         /* element (i, 0) */

        for (j = 0; j < n/4; j++) {
            pb0 = pc + j * ldc * 4;
            pb1 = pb0 + ldc;
            pb2 = pb1 + ldc;
            pb3 = pb2 + ldc;

            bb0 = (*pb0) * aa;
            bb1 = (*pb1) * aa;
            bb2 = (*pb2) * aa;
            bb3 = (*pb3) * aa;

            *b       = bb0;
            *(b + 1) = bb1;
            *(b + 2) = bb2;
            *(b + 3) = bb3;
            *pb0 = bb0;
            *pb1 = bb1;
            *pb2 = bb2;
            *pb3 = bb3;
            b += 4;

            /* Tops of the four columns; rows 0 .. i-1 get updated. */
            pc0 = c + j * ldc * 4;
            pc1 = pc0 + ldc;
            pc2 = pc1 + ldc;
            pc3 = pc2 + ldc;
            pa  = a;
            for (k = i; k > 0; k -= vl) {
                vl  = VSETVL(k);
                vc0 = VLEV_FLOAT(pc0, vl);
                vc1 = VLEV_FLOAT(pc1, vl);
                vc2 = VLEV_FLOAT(pc2, vl);
                vc3 = VLEV_FLOAT(pc3, vl);
                va  = VLEV_FLOAT(pa, vl);
                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
                vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
                vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl);
                vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl);
                VSEV_FLOAT(pc0, vc0, vl);
                VSEV_FLOAT(pc1, vc1, vl);
                VSEV_FLOAT(pc2, vc2, vl);
                VSEV_FLOAT(pc3, vc3, vl);
                pa  += vl;
                pc0 += vl;
                pc1 += vl;
                pc2 += vl;
                pc3 += vl;
            }
        }
        pc += ldc * (n/4) * 4;              /* element (i, first leftover column) */

        if (n & 2) {
            /*
             * pc already points at the first leftover column, so address the
             * tail relative to pc only.  Re-applying the loop counter j here
             * would skip columns whenever a 4-column group was processed.
             */
            pb0 = pc;
            pb1 = pb0 + ldc;

            bb0 = (*pb0) * aa;
            bb1 = (*pb1) * aa;

            *b       = bb0;
            *(b + 1) = bb1;
            *pb0 = bb0;
            *pb1 = bb1;
            b += 2;

            pc0 = pc - i;                   /* top of the first tail column */
            pc1 = pc0 + ldc;
            pa  = a;
            for (k = i; k > 0; k -= vl) {
                vl  = VSETVL(k);
                vc0 = VLEV_FLOAT(pc0, vl);
                vc1 = VLEV_FLOAT(pc1, vl);
                va  = VLEV_FLOAT(pa, vl);
                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
                vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
                VSEV_FLOAT(pc0, vc0, vl);
                VSEV_FLOAT(pc1, vc1, vl);
                pa  += vl;
                pc0 += vl;
                pc1 += vl;
            }
            pc += ldc * 2;
        }

        if (n & 1) {
            pb0 = pc;
            bb0 = (*pb0) * aa;
            *b   = bb0;
            *pb0 = bb0;
            b += 1;

            pc0 = pc - i;                   /* top of the last column */
            pa  = a;
            for (k = i; k > 0; k -= vl) {
                vl  = VSETVL(k);
                vc0 = VLEV_FLOAT(pc0, vl);
                va  = VLEV_FLOAT(pa, vl);
                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
                VSEV_FLOAT(pc0, vc0, vl);
                pa  += vl;
                pc0 += vl;
            }
        }

        /* b advanced by n above; step back to the previous row of each. */
        a -= m;
        b -= 2 * n;
    }
}
#elif GEMM_DEFAULT_UNROLL_N == 8
/*
 * LN triangular solve on one packed tile, real case, columns taken eight at
 * a time (GEMM_DEFAULT_UNROLL_N == 8) with 4-, 2- and 1-column tails.
 *
 * Rows are visited from m - 1 down to 0.  For row i every c[i, col] is
 * multiplied by a[i]; the product is appended to `b` and written back to
 * `c`, then eliminated from rows 0 .. i-1 of that column with a unit-stride
 * vector multiply-subtract against a[0 .. i-1].
 *
 * NOTE(review): the diagonal entry is multiplied, not divided, so `a`
 * presumably carries a pre-inverted diagonal -- confirm against the packing
 * routine.
 *
 * m, n  tile size (rows, columns)
 * a     m x m coefficient tile, row stride m
 * b     m x n packed output, row stride n
 * c     tile updated in place, column stride ldc
 */
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {

    FLOAT aa, bb0, bb1, bb2, bb3, bb4, bb5, bb6, bb7;
    FLOAT *pa, *pc, *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7;
    FLOAT *pb0, *pb1, *pb2, *pb3, *pb4, *pb5, *pb6, *pb7;

    int i, j, k;
    size_t vl;
    FLOAT_V_T va, vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7;

    /* Start at the last row of both a and b. */
    a += (m - 1) * m;
    b += (m - 1) * n;

    for (i = m - 1; i >= 0; i--) {
        aa = *(a + i);
        pc = c + i;                         /* element (i, 0) */

        for (j = 0; j < n/8; j++) {
            pb0 = pc + j * ldc * 8;
            pb1 = pb0 + ldc;
            pb2 = pb1 + ldc;
            pb3 = pb2 + ldc;
            pb4 = pb3 + ldc;
            pb5 = pb4 + ldc;
            pb6 = pb5 + ldc;
            pb7 = pb6 + ldc;

            bb0 = (*pb0) * aa;
            bb1 = (*pb1) * aa;
            bb2 = (*pb2) * aa;
            bb3 = (*pb3) * aa;
            bb4 = (*pb4) * aa;
            bb5 = (*pb5) * aa;
            bb6 = (*pb6) * aa;
            bb7 = (*pb7) * aa;

            *b       = bb0;
            *(b + 1) = bb1;
            *(b + 2) = bb2;
            *(b + 3) = bb3;
            *(b + 4) = bb4;
            *(b + 5) = bb5;
            *(b + 6) = bb6;
            *(b + 7) = bb7;
            *pb0 = bb0;
            *pb1 = bb1;
            *pb2 = bb2;
            *pb3 = bb3;
            *pb4 = bb4;
            *pb5 = bb5;
            *pb6 = bb6;
            *pb7 = bb7;
            b += 8;

            /* Tops of the eight columns; rows 0 .. i-1 get updated. */
            pc0 = c + j * ldc * 8;
            pc1 = pc0 + ldc;
            pc2 = pc1 + ldc;
            pc3 = pc2 + ldc;
            pc4 = pc3 + ldc;
            pc5 = pc4 + ldc;
            pc6 = pc5 + ldc;
            pc7 = pc6 + ldc;
            pa  = a;
            for (k = i; k > 0; k -= vl) {
                vl  = VSETVL(k);
                vc0 = VLEV_FLOAT(pc0, vl);
                vc1 = VLEV_FLOAT(pc1, vl);
                vc2 = VLEV_FLOAT(pc2, vl);
                vc3 = VLEV_FLOAT(pc3, vl);
                vc4 = VLEV_FLOAT(pc4, vl);
                vc5 = VLEV_FLOAT(pc5, vl);
                vc6 = VLEV_FLOAT(pc6, vl);
                vc7 = VLEV_FLOAT(pc7, vl);
                va  = VLEV_FLOAT(pa, vl);
                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
                vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
                vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl);
                vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl);
                vc4 = VFNMSACVF_FLOAT(vc4, bb4, va, vl);
                vc5 = VFNMSACVF_FLOAT(vc5, bb5, va, vl);
                vc6 = VFNMSACVF_FLOAT(vc6, bb6, va, vl);
                vc7 = VFNMSACVF_FLOAT(vc7, bb7, va, vl);
                VSEV_FLOAT(pc0, vc0, vl);
                VSEV_FLOAT(pc1, vc1, vl);
                VSEV_FLOAT(pc2, vc2, vl);
                VSEV_FLOAT(pc3, vc3, vl);
                VSEV_FLOAT(pc4, vc4, vl);
                VSEV_FLOAT(pc5, vc5, vl);
                VSEV_FLOAT(pc6, vc6, vl);
                VSEV_FLOAT(pc7, vc7, vl);
                pa  += vl;
                pc0 += vl;
                pc1 += vl;
                pc2 += vl;
                pc3 += vl;
                pc4 += vl;
                pc5 += vl;
                pc6 += vl;
                pc7 += vl;
            }
        }
        pc += ldc * (n/8) * 8;              /* element (i, first leftover column) */

        /*
         * Tails: pc always points at row i of the first column not yet
         * handled, so every tail is addressed relative to pc alone
         * (pc - i is the top of that column).  The loop counter j must not
         * be re-applied here.
         */
        if (n & 4) {
            pb0 = pc;
            pb1 = pb0 + ldc;
            pb2 = pb1 + ldc;
            pb3 = pb2 + ldc;

            bb0 = (*pb0) * aa;
            bb1 = (*pb1) * aa;
            bb2 = (*pb2) * aa;
            bb3 = (*pb3) * aa;

            *b       = bb0;
            *(b + 1) = bb1;
            *(b + 2) = bb2;
            *(b + 3) = bb3;
            *pb0 = bb0;
            *pb1 = bb1;
            *pb2 = bb2;
            *pb3 = bb3;
            b += 4;

            pc0 = pc - i;
            pc1 = pc0 + ldc;
            pc2 = pc1 + ldc;
            pc3 = pc2 + ldc;
            pa  = a;
            for (k = i; k > 0; k -= vl) {
                vl  = VSETVL(k);
                vc0 = VLEV_FLOAT(pc0, vl);
                vc1 = VLEV_FLOAT(pc1, vl);
                vc2 = VLEV_FLOAT(pc2, vl);
                vc3 = VLEV_FLOAT(pc3, vl);
                va  = VLEV_FLOAT(pa, vl);
                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
                vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
                vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl);
                vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl);
                VSEV_FLOAT(pc0, vc0, vl);
                VSEV_FLOAT(pc1, vc1, vl);
                VSEV_FLOAT(pc2, vc2, vl);
                VSEV_FLOAT(pc3, vc3, vl);
                pa  += vl;
                pc0 += vl;
                pc1 += vl;
                pc2 += vl;
                pc3 += vl;
            }
            pc += ldc * 4;
        }

        if (n & 2) {
            pb0 = pc;
            pb1 = pb0 + ldc;

            bb0 = (*pb0) * aa;
            bb1 = (*pb1) * aa;

            *b       = bb0;
            *(b + 1) = bb1;
            *pb0 = bb0;
            *pb1 = bb1;
            b += 2;

            pc0 = pc - i;
            pc1 = pc0 + ldc;
            pa  = a;
            for (k = i; k > 0; k -= vl) {
                vl  = VSETVL(k);
                vc0 = VLEV_FLOAT(pc0, vl);
                vc1 = VLEV_FLOAT(pc1, vl);
                va  = VLEV_FLOAT(pa, vl);
                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
                vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
                VSEV_FLOAT(pc0, vc0, vl);
                VSEV_FLOAT(pc1, vc1, vl);
                pa  += vl;
                pc0 += vl;
                pc1 += vl;
            }
            pc += ldc * 2;
        }

        if (n & 1) {
            pb0 = pc;
            bb0 = (*pb0) * aa;
            *b   = bb0;
            *pb0 = bb0;
            b += 1;

            pc0 = pc - i;
            pa  = a;
            for (k = i; k > 0; k -= vl) {
                vl  = VSETVL(k);
                vc0 = VLEV_FLOAT(pc0, vl);
                va  = VLEV_FLOAT(pa, vl);
                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
                VSEV_FLOAT(pc0, vc0, vl);
                pa  += vl;
                pc0 += vl;
            }
        }

        /* b advanced by n above; step back to the previous row of each. */
        a -= m;
        b -= 2 * n;
    }
}
#else
/*
 * Scalar LN triangular solve on one packed tile (real case).
 *
 * Walks the rows from m - 1 down to 0.  For each row, every column entry of
 * c is multiplied by a[row], appended to b, stored back into c, and then
 * subtracted (scaled by a[0 .. row-1]) from the rows above it in the same
 * column.
 *
 * NOTE(review): the diagonal entry is multiplied, not divided, so `a`
 * presumably carries a pre-inverted diagonal -- confirm against the packing
 * routine.
 *
 * m, n  tile size (rows, columns)
 * a     m x m coefficient tile, row stride m
 * b     m x n packed output, row stride n
 * c     tile updated in place, column stride ldc
 */
static inline void solve_generic(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {

    int row, col, above;

    /* Begin at the last row of a and of b. */
    a += (m - 1) * m;
    b += (m - 1) * n;

    for (row = m - 1; row >= 0; row--) {
        const FLOAT diag = a[row];

        for (col = 0; col < n; col++) {
            FLOAT *column = c + col * ldc;
            FLOAT solved  = column[row];

            solved *= diag;
            *b = solved;
            column[row] = solved;
            b++;

            /* Eliminate the solved entry from the rows above it. */
            for (above = 0; above < row; above++) {
                column[above] -= solved * a[above];
            }
        }

        /* b moved forward by n; step both back to the previous row. */
        a -= m;
        b -= 2 * n;
    }
}
#endif
#else
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
FLOAT aa1, aa2;
FLOAT bb1, bb2;
FLOAT cc1, cc2;
FLOAT aa1, aa2;
FLOAT *pc;
int i, j, k;
int i, j, k;
BLASLONG stride_ldc = sizeof(FLOAT) * ldc * 2;
ldc *= 2;
a += (m - 1) * m * 2;
b += (m - 1) * n * 2;
FLOAT_V_T vb1, vb2, vc1, vc2, vs1, vs2;
size_t vl;
a += (m - 1) * m * 2;
b += (m - 1) * n * 2;
for (i = m - 1; i >= 0; i--) {
for (i = m - 1; i >= 0; i--) {
aa1 = *(a + i * 2 + 0);
aa2 = *(a + i * 2 + 1);
for (j = 0; j < n; j ++) {
bb1 = *(c + i * 2 + 0 + j * ldc);
bb2 = *(c + i * 2 + 1 + j * ldc);
aa1 = *(a + i * 2 + 0);
aa2 = *(a + i * 2 + 1);
pc = c;
for (j = n; j > 0; j -= vl) {
vl = VSETVL(j);
VLSSEG2_FLOAT(&vb1, &vb2, pc + i * 2, stride_ldc, vl);
#ifndef CONJ
cc1 = aa1 * bb1 - aa2 * bb2;
cc2 = aa1 * bb2 + aa2 * bb1;
vs1 = VFMULVF_FLOAT(vb1, aa1, vl);
vs1 = VFNMSACVF_FLOAT(vs1, aa2, vb2, vl);
vs2 = VFMULVF_FLOAT(vb2, aa1, vl);
vs2 = VFMACCVF_FLOAT(vs2, aa2, vb1, vl);
#else
cc1 = aa1 * bb1 + aa2 * bb2;
cc2 = aa1 * bb2 - aa2 * bb1;
vs1 = VFMULVF_FLOAT(vb1, aa1, vl);
vs1 = VFMACCVF_FLOAT(vs1, aa2, vb2, vl);
vs2 = VFMULVF_FLOAT(vb2, aa1, vl);
vs2 = VFNMSACVF_FLOAT(vs2, aa2, vb1, vl);
#endif
VSSEG2_FLOAT(b, vs1, vs2, vl);
VSSSEG2_FLOAT(pc + i * 2, stride_ldc, vs1, vs2, vl);
b += vl * 2;
*(b + 0) = cc1;
*(b + 1) = cc2;
*(c + i * 2 + 0 + j * ldc) = cc1;
*(c + i * 2 + 1 + j * ldc) = cc2;
b += 2;
for (k = 0; k < i; k ++){
for (k = 0; k < i; k ++) {
VLSSEG2_FLOAT(&vc1, &vc2, pc + k * 2, stride_ldc, vl);
#ifndef CONJ
*(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1);
*(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0);
#else
*(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1);
*(c + k * 2 + 1 + j * ldc) -= - cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0);
vc1 = VFMACCVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl);
vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl);
vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl);
vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl);
#else
vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl);
vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl);
vc2 = VFMACCVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl);
vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl);
#endif
}
VSSSEG2_FLOAT(pc + k * 2, stride_ldc, vc1, vc2, vl);
}
pc += vl * ldc * 2;
}
a -= m * 2;
b -= 4 * n;
}
a -= m * 2;
b -= 4 * n;
}
}
#endif

View File

@ -31,28 +31,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define VSETVL(n) vsetvl_e32m2(n)
#define VSETVL_MAX vsetvlmax_e32m2()
#define FLOAT_V_T vfloat32m2_t
#define VLEV_FLOAT vle32_v_f32m2
#define VLSEV_FLOAT vlse32_v_f32m2
#define VLSEG2_FLOAT vlseg2e32_v_f32m2
#define VSEV_FLOAT vse32_v_f32m2
#define VSSEV_FLOAT vsse32_v_f32m2
#define VSEV_FLOAT vse32_v_f32m2
#define VLSEG2_FLOAT vlseg2e32_v_f32m2
#define VSSEG2_FLOAT vsseg2e32_v_f32m2
#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2
#define VSSSEG2_FLOAT vssseg2e32_v_f32m2
#define VFMACCVF_FLOAT vfmacc_vf_f32m2
#define VFMULVF_FLOAT vfmul_vf_f32m2
#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2
#define VFMULVF_FLOAT vfmul_vf_f32m2
#else
#define VSETVL(n) vsetvl_e64m2(n)
#define VSETVL_MAX vsetvlmax_e64m2()
#define FLOAT_V_T vfloat64m2_t
#define VLEV_FLOAT vle64_v_f64m2
#define VLSEV_FLOAT vlse64_v_f64m2
#define VLSEG2_FLOAT vlseg2e64_v_f64m2
#define VSEV_FLOAT vse64_v_f64m2
#define VSSEV_FLOAT vsse64_v_f64m2
#define VSEV_FLOAT vse64_v_f64m2
#define VLSEG2_FLOAT vlseg2e64_v_f64m2
#define VSSEG2_FLOAT vsseg2e64_v_f64m2
#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2
#define VSSSEG2_FLOAT vssseg2e64_v_f64m2
#define VFMVVF_FLOAT vfmv_v_f_f64m2
#define VFMACCVF_FLOAT vfmacc_vf_f64m2
#define VFMULVF_FLOAT vfmul_vf_f64m2
#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2
#define VFMULVF_FLOAT vfmul_vf_f64m2
#endif
@ -87,616 +90,101 @@ static FLOAT dm1 = -1.;
// Optimizes the implementation in ../arm64/trsm_kernel_LT_sve.c
#ifndef COMPLEX
#if GEMM_DEFAULT_UNROLL_N == 1
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc)
{
FLOAT aa, bb;
FLOAT *pa, *pc;
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
FLOAT aa;
FLOAT* pc;
int i, j, k;
BLASLONG stride_ldc = sizeof(FLOAT) * ldc;
FLOAT_V_T vb, vc;
size_t vl;
FLOAT_V_T va, vc;
for (i = 0; i < m; i++)
{
for (i = 0; i < m; i++) {
aa = *(a + i);
for (j = 0; j < n; j ++)
{
bb = *(c + i + j * ldc);
bb *= aa;
*b = bb;
*(c + i + j * ldc) = bb;
b++;
pa = a + i + 1;
pc = c + j * ldc + i + 1;
for (k = (m - i - 1); k > 0; k -= vl)
{
vl = VSETVL(k);
vc = VLEV_FLOAT(pc, vl);
va = VLEV_FLOAT(pa, vl);
vc = VFNMSACVF_FLOAT(vc, bb, va, vl);
VSEV_FLOAT(pc, vc, vl);
pa += vl;
pc += vl;
pc = c;
for (j = n; j > 0; j -= vl) {
vl = VSETVL(j);
vb = VLSEV_FLOAT(pc + i, stride_ldc, vl);
vb = VFMULVF_FLOAT(vb, aa, vl);
VSEV_FLOAT(b, vb, vl);
VSSEV_FLOAT(pc + i, stride_ldc, vb, vl);
b += vl;
for (k = i + 1; k < m; k++) {
vc = VLSEV_FLOAT(pc + k, stride_ldc, vl);
vc = VFNMSACVF_FLOAT(vc, *(a + k), vb, vl);
VSSEV_FLOAT(pc + k, stride_ldc, vc, vl);
}
pc += vl * ldc;
}
a += m;
}
}
#elif GEMM_DEFAULT_UNROLL_N == 2
/*
 * LT triangular solve on one packed tile, real case, columns taken two at a
 * time (GEMM_DEFAULT_UNROLL_N == 2).
 *
 * Rows are visited from 0 up to m - 1.  For row i every c[i, col] is
 * multiplied by a[i]; the product is appended to `b` and written back to
 * `c`, then eliminated from rows i+1 .. m-1 of that column with a
 * unit-stride vector multiply-subtract against a[i+1 .. m-1].
 *
 * NOTE(review): the diagonal entry is multiplied, not divided, so `a`
 * presumably carries a pre-inverted diagonal -- confirm against the packing
 * routine.
 *
 * m, n  tile size (rows, columns)
 * a     m x m coefficient tile, row stride m
 * b     m x n packed output, filled sequentially
 * c     tile updated in place, column stride ldc
 */
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc)
{
    FLOAT aa, bb0, bb1;
    FLOAT *pa, *pc, *pc0, *pc1;
    FLOAT *pb0, *pb1;

    int i, j, k;
    size_t vl;
    FLOAT_V_T va, vc0, vc1;

    for (i = 0; i < m; i++) {
        aa = *(a + i);
        pc = c + i;                         /* element (i, 0) */

        for (j = 0; j < n/2; j++) {
            pb0 = pc + j * ldc * 2;
            pb1 = pb0 + ldc;

            bb0 = (*pb0) * aa;
            bb1 = (*pb1) * aa;

            *b       = bb0;
            *(b + 1) = bb1;
            *pb0 = bb0;
            *pb1 = bb1;
            b += 2;

            /* Rows i+1 .. m-1 of both columns get updated. */
            pa  = a + i + 1;
            pc0 = pb0 + 1;
            pc1 = pc0 + ldc;
            for (k = (m - i - 1); k > 0; k -= vl) {
                vl  = VSETVL(k);
                vc0 = VLEV_FLOAT(pc0, vl);
                vc1 = VLEV_FLOAT(pc1, vl);
                va  = VLEV_FLOAT(pa, vl);
                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
                vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
                VSEV_FLOAT(pc0, vc0, vl);
                VSEV_FLOAT(pc1, vc1, vl);
                pa  += vl;
                pc0 += vl;
                pc1 += vl;
            }
        }
        pc += ldc * (n/2) * 2;              /* element (i, leftover column) */

        if (n & 1) {
            pb0 = pc;
            bb0 = (*pb0) * aa;
            *b = bb0;
            /*
             * Store through pb0: the leftover column is not column 0 when a
             * column pair was processed above, so c + i would be wrong.
             */
            *pb0 = bb0;
            b++;

            pa  = a + i + 1;
            pc0 = pb0 + 1;
            for (k = (m - i - 1); k > 0; k -= vl) {
                vl  = VSETVL(k);
                vc0 = VLEV_FLOAT(pc0, vl);
                va  = VLEV_FLOAT(pa, vl);
                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
                VSEV_FLOAT(pc0, vc0, vl);
                pa  += vl;
                pc0 += vl;
            }
        }

        a += m;                             /* next row of a */
    }
}
#elif GEMM_DEFAULT_UNROLL_N == 4
/*
 * LT triangular solve on one packed tile, real case, columns taken four at a
 * time (GEMM_DEFAULT_UNROLL_N == 4) with 2-column and 1-column tails.
 *
 * Rows are visited from 0 up to m - 1.  For row i every c[i, col] is
 * multiplied by a[i]; the product is appended to `b` and written back to
 * `c`, then eliminated from rows i+1 .. m-1 of that column with a
 * unit-stride vector multiply-subtract against a[i+1 .. m-1].
 *
 * NOTE(review): the diagonal entry is multiplied, not divided, so `a`
 * presumably carries a pre-inverted diagonal -- confirm against the packing
 * routine.
 *
 * m, n  tile size (rows, columns)
 * a     m x m coefficient tile, row stride m
 * b     m x n packed output, filled sequentially
 * c     tile updated in place, column stride ldc
 */
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc)
{
    FLOAT aa, bb0, bb1, bb2, bb3;
    FLOAT *pa, *pc;
    FLOAT *pc0, *pc1, *pc2, *pc3;
    FLOAT *pb0, *pb1, *pb2, *pb3;

    int i, j, k;
    size_t vl;
    FLOAT_V_T va;
    FLOAT_V_T vc0, vc1, vc2, vc3;

    for (i = 0; i < m; i++) {
        aa = *(a + i);
        pc = c + i;                         /* element (i, 0) */

        for (j = 0; j < n/4; j++) {
            /*
             * Offset by the group index so each pass handles its own four
             * columns rather than revisiting columns 0..3.
             */
            pb0 = pc + j * ldc * 4;
            pb1 = pb0 + ldc;
            pb2 = pb1 + ldc;
            pb3 = pb2 + ldc;

            bb0 = (*pb0) * aa;
            bb1 = (*pb1) * aa;
            bb2 = (*pb2) * aa;
            bb3 = (*pb3) * aa;

            *b       = bb0;
            *(b + 1) = bb1;
            *(b + 2) = bb2;
            *(b + 3) = bb3;
            *pb0 = bb0;
            *pb1 = bb1;
            *pb2 = bb2;
            *pb3 = bb3;
            b += 4;

            /* Rows i+1 .. m-1 of the four columns get updated. */
            pa  = a + i + 1;
            pc0 = pb0 + 1;
            pc1 = pc0 + ldc;
            pc2 = pc1 + ldc;
            pc3 = pc2 + ldc;
            for (k = (m - i - 1); k > 0; k -= vl) {
                vl  = VSETVL(k);
                vc0 = VLEV_FLOAT(pc0, vl);
                vc1 = VLEV_FLOAT(pc1, vl);
                vc2 = VLEV_FLOAT(pc2, vl);
                vc3 = VLEV_FLOAT(pc3, vl);
                va  = VLEV_FLOAT(pa, vl);
                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
                vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
                vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl);
                vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl);
                VSEV_FLOAT(pc0, vc0, vl);
                VSEV_FLOAT(pc1, vc1, vl);
                VSEV_FLOAT(pc2, vc2, vl);
                VSEV_FLOAT(pc3, vc3, vl);
                pa  += vl;
                pc0 += vl;
                pc1 += vl;
                pc2 += vl;
                pc3 += vl;
            }
        }
        pc += ldc * (n/4) * 4;              /* element (i, first leftover column) */

        if (n & 2) {
            pb0 = pc;
            pb1 = pb0 + ldc;

            bb0 = (*pb0) * aa;
            bb1 = (*pb1) * aa;

            *b       = bb0;
            *(b + 1) = bb1;
            *pb0 = bb0;
            *pb1 = bb1;
            b += 2;

            pa  = a + i + 1;
            pc0 = pb0 + 1;
            pc1 = pc0 + ldc;
            for (k = (m - i - 1); k > 0; k -= vl) {
                vl  = VSETVL(k);
                vc0 = VLEV_FLOAT(pc0, vl);
                vc1 = VLEV_FLOAT(pc1, vl);
                va  = VLEV_FLOAT(pa, vl);
                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
                vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
                VSEV_FLOAT(pc0, vc0, vl);
                VSEV_FLOAT(pc1, vc1, vl);
                pa  += vl;
                pc0 += vl;
                pc1 += vl;
            }
            pc += ldc * 2;
        }

        if (n & 1) {
            pb0 = pc;
            bb0 = (*pb0) * aa;
            *b = bb0;
            /*
             * Store through pb0: the leftover column is not column 0 when
             * earlier column groups were processed, so c + i would be wrong.
             */
            *pb0 = bb0;
            b++;

            pa  = a + i + 1;
            pc0 = pb0 + 1;
            for (k = (m - i - 1); k > 0; k -= vl) {
                vl  = VSETVL(k);
                vc0 = VLEV_FLOAT(pc0, vl);
                va  = VLEV_FLOAT(pa, vl);
                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
                VSEV_FLOAT(pc0, vc0, vl);
                pa  += vl;
                pc0 += vl;
            }
        }

        a += m;                             /* next row of a */
    }
}
#elif GEMM_DEFAULT_UNROLL_N == 8
/*
 * LT triangular solve on one packed tile, real case, columns taken eight at
 * a time (GEMM_DEFAULT_UNROLL_N == 8) with 4-, 2- and 1-column tails.
 *
 * Rows are visited from 0 up to m - 1.  For row i every c[i, col] is
 * multiplied by a[i]; the product is appended to `b` and written back to
 * `c`, then eliminated from rows i+1 .. m-1 of that column with a
 * unit-stride vector multiply-subtract against a[i+1 .. m-1].
 *
 * NOTE(review): the diagonal entry is multiplied, not divided, so `a`
 * presumably carries a pre-inverted diagonal -- confirm against the packing
 * routine.
 *
 * m, n  tile size (rows, columns)
 * a     m x m coefficient tile, row stride m
 * b     m x n packed output, filled sequentially
 * c     tile updated in place, column stride ldc
 */
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc)
{
    FLOAT aa, bb0, bb1, bb2, bb3, bb4, bb5, bb6, bb7;
    FLOAT *pa, *pc;
    FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7;
    FLOAT *pb0, *pb1, *pb2, *pb3, *pb4, *pb5, *pb6, *pb7;

    int i, j, k;
    size_t vl;
    FLOAT_V_T va;
    FLOAT_V_T vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7;

    for (i = 0; i < m; i++) {
        aa = *(a + i);
        pc = c + i;                         /* element (i, 0) */

        for (j = 0; j < n/8; j++) {
            pb0 = pc + j * ldc * 8;
            pb1 = pb0 + ldc;
            pb2 = pb1 + ldc;
            pb3 = pb2 + ldc;
            pb4 = pb3 + ldc;
            pb5 = pb4 + ldc;
            pb6 = pb5 + ldc;
            pb7 = pb6 + ldc;

            bb0 = (*pb0) * aa;
            bb1 = (*pb1) * aa;
            bb2 = (*pb2) * aa;
            bb3 = (*pb3) * aa;
            bb4 = (*pb4) * aa;
            bb5 = (*pb5) * aa;
            bb6 = (*pb6) * aa;
            bb7 = (*pb7) * aa;

            *b       = bb0;
            *(b + 1) = bb1;
            *(b + 2) = bb2;
            *(b + 3) = bb3;
            *(b + 4) = bb4;
            *(b + 5) = bb5;
            *(b + 6) = bb6;
            *(b + 7) = bb7;
            *pb0 = bb0;
            *pb1 = bb1;
            *pb2 = bb2;
            *pb3 = bb3;
            *pb4 = bb4;
            *pb5 = bb5;
            *pb6 = bb6;
            *pb7 = bb7;
            b += 8;

            /* Rows i+1 .. m-1 of the eight columns get updated. */
            pa  = a + i + 1;
            pc0 = pb0 + 1;
            pc1 = pc0 + ldc;
            pc2 = pc1 + ldc;
            pc3 = pc2 + ldc;
            pc4 = pc3 + ldc;
            pc5 = pc4 + ldc;
            pc6 = pc5 + ldc;
            pc7 = pc6 + ldc;
            for (k = (m - i - 1); k > 0; k -= vl) {
                vl  = VSETVL(k);
                vc0 = VLEV_FLOAT(pc0, vl);
                vc1 = VLEV_FLOAT(pc1, vl);
                vc2 = VLEV_FLOAT(pc2, vl);
                vc3 = VLEV_FLOAT(pc3, vl);
                vc4 = VLEV_FLOAT(pc4, vl);
                vc5 = VLEV_FLOAT(pc5, vl);
                vc6 = VLEV_FLOAT(pc6, vl);
                vc7 = VLEV_FLOAT(pc7, vl);
                va  = VLEV_FLOAT(pa, vl);
                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
                vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
                vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl);
                vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl);
                vc4 = VFNMSACVF_FLOAT(vc4, bb4, va, vl);
                vc5 = VFNMSACVF_FLOAT(vc5, bb5, va, vl);
                vc6 = VFNMSACVF_FLOAT(vc6, bb6, va, vl);
                vc7 = VFNMSACVF_FLOAT(vc7, bb7, va, vl);
                VSEV_FLOAT(pc0, vc0, vl);
                VSEV_FLOAT(pc1, vc1, vl);
                VSEV_FLOAT(pc2, vc2, vl);
                VSEV_FLOAT(pc3, vc3, vl);
                VSEV_FLOAT(pc4, vc4, vl);
                VSEV_FLOAT(pc5, vc5, vl);
                VSEV_FLOAT(pc6, vc6, vl);
                VSEV_FLOAT(pc7, vc7, vl);
                pa  += vl;
                pc0 += vl;
                pc1 += vl;
                pc2 += vl;
                pc3 += vl;
                pc4 += vl;
                pc5 += vl;
                pc6 += vl;
                pc7 += vl;
            }
        }
        pc += ldc * (n/8) * 8;              /* element (i, first leftover column) */

        if (n & 4) {
            pb0 = pc;
            pb1 = pb0 + ldc;
            pb2 = pb1 + ldc;
            pb3 = pb2 + ldc;

            bb0 = (*pb0) * aa;
            bb1 = (*pb1) * aa;
            bb2 = (*pb2) * aa;
            bb3 = (*pb3) * aa;

            *b       = bb0;
            *(b + 1) = bb1;
            *(b + 2) = bb2;
            *(b + 3) = bb3;
            *pb0 = bb0;
            *pb1 = bb1;
            *pb2 = bb2;
            *pb3 = bb3;
            b += 4;

            pa  = a + i + 1;
            pc0 = pb0 + 1;
            pc1 = pc0 + ldc;
            pc2 = pc1 + ldc;
            pc3 = pc2 + ldc;
            for (k = (m - i - 1); k > 0; k -= vl) {
                vl  = VSETVL(k);
                vc0 = VLEV_FLOAT(pc0, vl);
                vc1 = VLEV_FLOAT(pc1, vl);
                vc2 = VLEV_FLOAT(pc2, vl);
                vc3 = VLEV_FLOAT(pc3, vl);
                va  = VLEV_FLOAT(pa, vl);
                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
                vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
                vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl);
                vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl);
                VSEV_FLOAT(pc0, vc0, vl);
                VSEV_FLOAT(pc1, vc1, vl);
                VSEV_FLOAT(pc2, vc2, vl);
                VSEV_FLOAT(pc3, vc3, vl);
                pa  += vl;
                pc0 += vl;
                pc1 += vl;
                pc2 += vl;
                pc3 += vl;
            }
            pc += ldc * 4;
        }

        if (n & 2) {
            pb0 = pc;
            pb1 = pb0 + ldc;

            bb0 = (*pb0) * aa;
            bb1 = (*pb1) * aa;

            *b       = bb0;
            *(b + 1) = bb1;
            *pb0 = bb0;
            *pb1 = bb1;
            b += 2;

            pa  = a + i + 1;
            pc0 = pb0 + 1;
            pc1 = pc0 + ldc;
            for (k = (m - i - 1); k > 0; k -= vl) {
                vl  = VSETVL(k);
                vc0 = VLEV_FLOAT(pc0, vl);
                vc1 = VLEV_FLOAT(pc1, vl);
                va  = VLEV_FLOAT(pa, vl);
                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
                vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
                VSEV_FLOAT(pc0, vc0, vl);
                VSEV_FLOAT(pc1, vc1, vl);
                pa  += vl;
                pc0 += vl;
                pc1 += vl;
            }
            pc += ldc * 2;
        }

        if (n & 1) {
            pb0 = pc;
            bb0 = (*pb0) * aa;
            *b = bb0;
            /*
             * Store through pb0: the leftover column is not column 0 when
             * earlier column groups were processed, so c + i would be wrong.
             */
            *pb0 = bb0;
            b++;

            pa  = a + i + 1;
            pc0 = pb0 + 1;
            for (k = (m - i - 1); k > 0; k -= vl) {
                vl  = VSETVL(k);
                vc0 = VLEV_FLOAT(pc0, vl);
                va  = VLEV_FLOAT(pa, vl);
                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
                VSEV_FLOAT(pc0, vc0, vl);
                pa  += vl;
                pc0 += vl;
            }
        }

        a += m;                             /* next row of a */
    }
}
#else
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
FLOAT aa, bb;
FLOAT aa1, aa2;
FLOAT *pc;
int i, j, k;
int i, j, k;
BLASLONG stride_ldc = sizeof(FLOAT) * ldc * 2;
for (i = 0; i < m; i++) {
FLOAT_V_T vb1, vb2, vc1, vc2, vs1, vs2;
size_t vl;
aa = *(a + i);
ldc *= 2;
for (j = 0; j < n; j ++) {
bb = *(c + i + j * ldc);
bb *= aa;
*b = bb;
*(c + i + j * ldc) = bb;
b ++;
for (k = i + 1; k < m; k ++){
*(c + k + j * ldc) -= bb * *(a + k);
}
}
a += m;
}
}
#endif
#else
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
FLOAT aa1, aa2;
FLOAT bb1, bb2;
FLOAT cc1, cc2;
int i, j, k;
ldc *= 2;
for (i = 0; i < m; i++) {
aa1 = *(a + i * 2 + 0);
aa2 = *(a + i * 2 + 1);
for (j = 0; j < n; j ++) {
bb1 = *(c + i * 2 + 0 + j * ldc);
bb2 = *(c + i * 2 + 1 + j * ldc);
for (i = 0; i < m; i++) {
aa1 = *(a + i * 2 + 0);
aa2 = *(a + i * 2 + 1);
pc = c;
for (j = n; j > 0; j -= vl) {
vl = VSETVL(j);
VLSSEG2_FLOAT(&vb1, &vb2, pc + i * 2, stride_ldc, vl);
#ifndef CONJ
cc1 = aa1 * bb1 - aa2 * bb2;
cc2 = aa1 * bb2 + aa2 * bb1;
vs1 = VFMULVF_FLOAT(vb1, aa1, vl);
vs1 = VFNMSACVF_FLOAT(vs1, aa2, vb2, vl);
vs2 = VFMULVF_FLOAT(vb2, aa1, vl);
vs2 = VFMACCVF_FLOAT(vs2, aa2, vb1, vl);
#else
cc1 = aa1 * bb1 + aa2 * bb2;
cc2 = aa1 * bb2 - aa2 * bb1;
vs1 = VFMULVF_FLOAT(vb1, aa1, vl);
vs1 = VFMACCVF_FLOAT(vs1, aa2, vb2, vl);
vs2 = VFMULVF_FLOAT(vb2, aa1, vl);
vs2 = VFNMSACVF_FLOAT(vs2, aa2, vb1, vl);
#endif
VSSEG2_FLOAT(b, vs1, vs2, vl);
VSSSEG2_FLOAT(pc + i * 2, stride_ldc, vs1, vs2, vl);
b += vl * 2;
*(b + 0) = cc1;
*(b + 1) = cc2;
*(c + i * 2 + 0 + j * ldc) = cc1;
*(c + i * 2 + 1 + j * ldc) = cc2;
b += 2;
for (k = i + 1; k < m; k ++){
for (k = i + 1; k < m; k++) {
VLSSEG2_FLOAT(&vc1, &vc2, pc + k * 2, stride_ldc, vl);
#ifndef CONJ
*(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1);
*(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0);
#else
*(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1);
*(c + k * 2 + 1 + j * ldc) -= -cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0);
vc1 = VFMACCVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl);
vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl);
vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl);
vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl);
#else
vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl);
vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl);
vc2 = VFMACCVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl);
vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl);
#endif
}
}
a += m * 2;
}
}
static inline void solve_N1(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
FLOAT aa1, aa2;
FLOAT bb1, bb2;
FLOAT cc1, cc2;
FLOAT *pa, *pc;
int i, j, k;
size_t vl;
FLOAT_V_T va0, va1, vc0, vc1;
ldc *= 2;
for (i = 0; i < m; i++) {
aa1 = *(a + i * 2 + 0);
aa2 = *(a + i * 2 + 1);
for (j = 0; j < n; j ++) {
bb1 = *(c + i * 2 + 0 + j * ldc);
bb2 = *(c + i * 2 + 1 + j * ldc);
#ifndef CONJ
cc1 = aa1 * bb1 - aa2 * bb2;
cc2 = aa1 * bb2 + aa2 * bb1;
#else
cc1 = aa1 * bb1 + aa2 * bb2;
cc2 = aa1 * bb2 - aa2 * bb1;
#endif
*(b + 0) = cc1;
*(b + 1) = cc2;
*(c + i * 2 + 0 + j * ldc) = cc1;
*(c + i * 2 + 1 + j * ldc) = cc2;
b += 2;
pa = a + (i + 1) * 2;
pc = c + j * ldc + (i + 1) * 2;
for (k = (m - i - 1); k > 0; k -= vl)
{
vl = VSETVL(k);
VLSEG2_FLOAT(&va0, &va1, pa, vl);
VLSEG2_FLOAT(&vc0, &vc1, pc, vl);
#ifndef CONJ
vc0 = VFNMSACVF_FLOAT(vc0, cc1, va0);
vc0 = VFMACCVF_FLOAT(vc0, cc2, va1);
vc1 = VFNMSACVF_FLOAT(vc1, cc1, va1);
vc1 = VFNMSACVF_FLOAT(vc1, cc2, va0);
#else
vc0 = VFNMSACVF_FLOAT(vc0, cc1, va0);
vc0 = VFNMSACVF_FLOAT(vc0, cc2, va1);
vc1 = VFMACCVF_FLOAT(vc1, cc1, va1);
vc1 = VFNMSACVF_FLOAT(vc1, cc2, va0);
#endif
VSSEG2_FLOAT(pc, vc0, vc1, vl);
pa += vl * 2;
pc += vl * 2;
VSSSEG2_FLOAT(pc + k * 2, stride_ldc, vc1, vc2, vl);
}
pc += vl * ldc * 2;
}
}
a += m * 2;
}
a += m * 2;
}
}
#endif
@ -714,7 +202,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
size_t vl = VSETVL_MAX;
//fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug
//fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug
j = (n >> GEMM_UNROLL_N_SHIFT);

View File

@ -32,28 +32,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define VSETVL_MAX vsetvlmax_e32m2()
#define FLOAT_V_T vfloat32m2_t
#define VLEV_FLOAT vle32_v_f32m2
#define VLSEV_FLOAT vlse32_v_f32m2
#define VLSEG2_FLOAT vlseg2e32_v_f32m2
#define VSEV_FLOAT vse32_v_f32m2
#define VSSEV_FLOAT vsse32_v_f32m2
#define VSEV_FLOAT vse32_v_f32m2
#define VLSEG2_FLOAT vlseg2e32_v_f32m2
#define VSSEG2_FLOAT vsseg2e32_v_f32m2
#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2
#define VSSSEG2_FLOAT vssseg2e32_v_f32m2
#define VFMACCVF_FLOAT vfmacc_vf_f32m2
#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2
#define VFMULVF_FLOAT vfmul_vf_f32m2
#else
#define VSETVL(n) vsetvl_e64m2(n)
#define VSETVL_MAX vsetvlmax_e64m2()
#define FLOAT_V_T vfloat64m2_t
#define VLEV_FLOAT vle64_v_f64m2
#define VLSEV_FLOAT vlse64_v_f64m2
#define VLSEG2_FLOAT vlseg2e64_v_f64m2
#define VSEV_FLOAT vse64_v_f64m2
#define VSSEV_FLOAT vsse64_v_f64m2
#define VSEV_FLOAT vse64_v_f64m2
#define VLSEG2_FLOAT vlseg2e64_v_f64m2
#define VSSEG2_FLOAT vsseg2e64_v_f64m2
#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2
#define VSSSEG2_FLOAT vssseg2e64_v_f64m2
#define VFMVVF_FLOAT vfmv_v_f_f64m2
#define VFMACCVF_FLOAT vfmacc_vf_f64m2
#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2
#define VFMULVF_FLOAT vfmul_vf_f64m2
#endif
static FLOAT dm1 = -1.;
#ifdef CONJ
@ -86,569 +90,99 @@ static FLOAT dm1 = -1.;
#ifndef COMPLEX
#if GEMM_DEFAULT_UNROLL_N == 1
/*
 * Forward-substitution step on an m x n tile of `c` (real data, variant
 * compiled when GEMM_DEFAULT_UNROLL_N == 1).
 *
 *   m, n : tile dimensions; `c` is addressed as c[row + col * ldc].
 *   a    : output buffer; the scaled value of every processed element is
 *          appended to it, m values per column i.
 *   b    : n x n coefficient block read row by row (b advances by n per i).
 *          Column i of `c` is *multiplied* by b[i], so the diagonal is
 *          presumably stored pre-inverted -- TODO confirm against the
 *          packing routine.
 *   c    : tile updated in place.
 *   ldc  : distance, in FLOATs, between consecutive columns of `c`.
 *
 * For every column i (ascending) and row j:
 *   aa = c[j + i*ldc] * b[i];  c[j + k*ldc] -= aa * b[k]  for k = i+1..n-1.
 * The k-update is vectorised with strided loads/stores along row j.
 */
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {

    FLOAT aa, bb;
    FLOAT *pb, *pc;
    BLASLONG stride_ldc = sizeof(FLOAT) * ldc;   /* byte stride between columns of c */
    int i, j, k;
    size_t vl;
    FLOAT_V_T vb, vc;

    for (i = 0; i < n; i++) {
        bb = *(b + i);

        for (j = 0; j < m; j++) {
            aa = *(c + j + i * ldc);
            aa *= bb;
            *a = aa;
            *(c + j + i * ldc) = aa;
            a++;

            /* Update the remaining entries of row j: columns i+1 .. n-1. */
            pb = b + i + 1;
            pc = c + j + (i + 1) * ldc;
            for (k = (n - i - 1); k > 0; k -= vl) {
                vl = VSETVL(k);
                vc = VLSEV_FLOAT(pc, stride_ldc, vl);
                vb = VLEV_FLOAT(pb, vl);
                vc = VFNMSACVF_FLOAT(vc, aa, vb, vl);
                VSSEV_FLOAT(pc, stride_ldc, vc, vl);
                pb += vl;
                /* The chunk covered vl columns, so step vl columns ahead.
                 * (The previous `pc ++` moved to the next row instead and
                 * would corrupt `c` whenever a second chunk was needed.) */
                pc += (BLASLONG)vl * ldc;
            }
        }
        b += n;
    }
}
#elif GEMM_DEFAULT_UNROLL_N == 2
/*
 * Forward-substitution step on an m x n tile of `c` (real data, variant
 * compiled when GEMM_DEFAULT_UNROLL_N == 2). Rows are handled two at a time,
 * with a single-row tail when m is odd.
 *
 *   a    : output buffer; every scaled element is appended to it.
 *   b    : n x n coefficient block read row by row (b advances by n per i).
 *          Column i of `c` is multiplied by b[i], so the diagonal is
 *          presumably stored pre-inverted -- TODO confirm against the
 *          packing routine.
 *   c    : tile updated in place, addressed as c[row + col * ldc].
 *   ldc  : distance, in FLOATs, between consecutive columns of `c`.
 *
 * For every column i (ascending) and row r:
 *   aa = c[r + i*ldc] * b[i];  c[r + k*ldc] -= aa * b[k]  for k = i+1..n-1.
 */
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {

    FLOAT aa0, aa1, bb;
    FLOAT *pb, *pc;
    FLOAT *pa0, *pa1, *pc0, *pc1;
    BLASLONG stride_ldc = sizeof(FLOAT) * ldc;   /* byte stride between columns of c */
    int i, j, k;
    size_t vl;
    FLOAT_V_T vb, vc0, vc1;

    for (i = 0; i < n; i++) {
        bb = *(b + i);
        pc = c + i * ldc;

        /* Row pairs (2j, 2j+1) of column i. */
        for (j = 0; j < m/2; j++) {
            pa0 = pc + j * 2;
            pa1 = pc + j * 2 + 1;
            aa0 = *pa0 * bb;
            aa1 = *pa1 * bb;

            *pa0 = aa0;
            *pa1 = aa1;
            *a = aa0;
            *(a + 1) = aa1;
            a += 2;

            /* Propagate into columns i+1 .. n-1 of both rows. */
            pb = b + i + 1;
            pc0 = pa0 + ldc;
            pc1 = pa1 + ldc;
            for (k = (n - i - 1); k > 0; k -= vl) {
                vl = VSETVL(k);
                vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
                vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
                vb = VLEV_FLOAT(pb, vl);
                vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
                vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
                VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
                VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
                pb += vl;
                /* Each chunk spans vl columns; advance by vl columns
                 * (previously `++`, which stepped to the next row). */
                pc0 += (BLASLONG)vl * ldc;
                pc1 += (BLASLONG)vl * ldc;
            }
        }
        pc += (m/2)*2;

        /* Odd trailing row. */
        if (m & 1) {
            pa0 = pc;
            aa0 = *pa0 * bb;

            *pa0 = aa0;
            *a = aa0;
            a += 1;

            pb = b + i + 1;
            pc0 = pa0 + ldc;
            for (k = (n - i - 1); k > 0; k -= vl) {
                vl = VSETVL(k);
                vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
                vb = VLEV_FLOAT(pb, vl);
                vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
                VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
                pb += vl;
                pc0 += (BLASLONG)vl * ldc;   /* advance vl columns, not one row */
            }
        }
        b += n;
    }
}
#elif GEMM_DEFAULT_UNROLL_N == 4
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
FLOAT bb;
FLOAT aa0, aa1, aa2, aa3;
FLOAT *pb, *pc;
FLOAT *pa0, *pa1, *pa2, *pa3;
FLOAT *pc0, *pc1, *pc2, *pc3;
BLASLONG stride_ldc = sizeof(FLOAT) * ldc;
FLOAT *pci, *pcj;
int i, j, k;
FLOAT_V_T va, vc;
size_t vl;
FLOAT_V_T vb, vc0, vc1, vc2, vc3;
for (i = 0; i < n; i++) {
for (i = 0; i < n; i++)
{
bb = *(b + i);
pc = c + i * ldc;
for (j = 0; j < m/4; j ++)
{
pa0 = pc + j * 4;
pa1 = pa0 + 1;
pa2 = pa1 + 1;
pa3 = pa2 + 1;
aa0 = *pa0 * bb;
aa1 = *pa1 * bb;
aa2 = *pa2 * bb;
aa3 = *pa3 * bb;
*pa0 = aa0;
*pa1 = aa1;
*pa2 = aa2;
*pa3 = aa3;
*a = aa0;
*(a + 1)= aa1;
*(a + 2)= aa2;
*(a + 3)= aa3;
a += 4;
pb = b + i + 1;
pc0 = pa0 + ldc;
pc1 = pa1 + ldc;
pc2 = pa2 + ldc;
pc3 = pa3 + ldc;
for (k = (n - i - 1); k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl);
vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl);
vb = VLEV_FLOAT(pb, vl);
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl);
vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl);
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
VSSEV_FLOAT(pc2, stride_ldc, vc2, vl);
VSSEV_FLOAT(pc3, stride_ldc, vc3, vl);
pb += vl;
pc0++;
pc1++;
pc2++;
pc3++;
}
}
pc += (m/4)*4;
if (m & 2)
{
pa0 = pc;
pa1 = pa0 + 1;
aa0 = *pa0 * bb;
aa1 = *pa1 * bb;
*pa0 = aa0;
*pa1 = aa1;
*a = aa0;
*(a + 1)= aa1;
a += 2;
pb = b + i + 1;
pc0 = pa0 + ldc;
pc1 = pa1 + ldc;
for (k = (n - i - 1); k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
vb = VLEV_FLOAT(pb, vl);
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
pb += vl;
pc0++;
pc1++;
}
pc += 2;
}
if (m & 1)
{
pa0 = pc;
aa0 = *pa0 * bb;
*pa0 = aa0;
*a = aa0;
a += 1;
pb = b + i + 1;
pc0 = pa0 + ldc;
for (k = (n - i - 1); k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
vb = VLEV_FLOAT(pb, vl);
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
pb += vl;
pc0++;
pci = c + i * ldc;
pcj = c;
for (j = m; j > 0; j -= vl) {
vl = VSETVL(j);
va = VLEV_FLOAT(pci, vl);
va = VFMULVF_FLOAT(va, bb, vl);
VSEV_FLOAT(a, va, vl);
VSEV_FLOAT(pci, va, vl);
a += vl;
pci += vl;
for (k = i + 1; k < n; k ++){
vc = VLEV_FLOAT(pcj + k * ldc, vl);
vc = VFNMSACVF_FLOAT(vc, *(b + k), va, vl);
VSEV_FLOAT(pcj + k * ldc, vc, vl);
}
pcj += vl;
}
b += n;
}
}
#elif GEMM_DEFAULT_UNROLL_N == 8
#else
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
FLOAT bb;
FLOAT aa0, aa1, aa2, aa3, aa4, aa5, aa6, aa7;
FLOAT *pb, *pc;
FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;
FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7;
BLASLONG stride_ldc = sizeof(FLOAT) * ldc;
FLOAT bb1, bb2;
FLOAT *pci, *pcj;
int i, j, k;
FLOAT_V_T va1, va2, vs1, vs2, vc1, vc2;
size_t vl;
FLOAT_V_T vb, vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7;
for (i = 0; i < n; i++)
{
bb = *(b + i);
pc = c + i * ldc;
for (j = 0; j < m/8; j ++)
{
pa0 = pc + j * 8;
pa1 = pa0 + 1;
pa2 = pa1 + 1;
pa3 = pa2 + 1;
pa4 = pa3 + 1;
pa5 = pa4 + 1;
pa6 = pa5 + 1;
pa7 = pa6 + 1;
aa0 = *pa0 * bb;
aa1 = *pa1 * bb;
aa2 = *pa2 * bb;
aa3 = *pa3 * bb;
aa4 = *pa4 * bb;
aa5 = *pa5 * bb;
aa6 = *pa6 * bb;
aa7 = *pa7 * bb;
for (i = 0; i < n; i++) {
*pa0 = aa0;
*pa1 = aa1;
*pa2 = aa2;
*pa3 = aa3;
*pa4 = aa4;
*pa5 = aa5;
*pa6 = aa6;
*pa7 = aa7;
bb1 = *(b + i * 2 + 0);
bb2 = *(b + i * 2 + 1);
*a = aa0;
*(a + 1)= aa1;
*(a + 2)= aa2;
*(a + 3)= aa3;
*(a + 4)= aa4;
*(a + 5)= aa5;
*(a + 6)= aa6;
*(a + 7)= aa7;
a += 8;
pb = b + i + 1;
pc0 = pa0 + ldc;
pc1 = pa1 + ldc;
pc2 = pa2 + ldc;
pc3 = pa3 + ldc;
pc4 = pa4 + ldc;
pc5 = pa5 + ldc;
pc6 = pa6 + ldc;
pc7 = pa7 + ldc;
for (k = (n - i - 1); k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl);
vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl);
vc4 = VLSEV_FLOAT(pc4, stride_ldc, vl);
vc5 = VLSEV_FLOAT(pc5, stride_ldc, vl);
vc6 = VLSEV_FLOAT(pc6, stride_ldc, vl);
vc7 = VLSEV_FLOAT(pc7, stride_ldc, vl);
vb = VLEV_FLOAT(pb, vl);
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl);
vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl);
vc4 = VFNMSACVF_FLOAT(vc4, aa4, vb, vl);
vc5 = VFNMSACVF_FLOAT(vc5, aa5, vb, vl);
vc6 = VFNMSACVF_FLOAT(vc6, aa6, vb, vl);
vc7 = VFNMSACVF_FLOAT(vc7, aa7, vb, vl);
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
VSSEV_FLOAT(pc2, stride_ldc, vc2, vl);
VSSEV_FLOAT(pc3, stride_ldc, vc3, vl);
VSSEV_FLOAT(pc4, stride_ldc, vc4, vl);
VSSEV_FLOAT(pc5, stride_ldc, vc5, vl);
VSSEV_FLOAT(pc6, stride_ldc, vc6, vl);
VSSEV_FLOAT(pc7, stride_ldc, vc7, vl);
pb += vl;
pc0++;
pc1++;
pc2++;
pc3++;
pc4++;
pc5++;
pc6++;
pc7++;
}
}
pc += (m/8)*8;
if (m & 4)
{
pa0 = pc;
pa1 = pa0 + 1;
pa2 = pa1 + 1;
pa3 = pa2 + 1;
aa0 = *pa0 * bb;
aa1 = *pa1 * bb;
aa2 = *pa2 * bb;
aa3 = *pa3 * bb;
*pa0 = aa0;
*pa1 = aa1;
*pa2 = aa2;
*pa3 = aa3;
*a = aa0;
*(a + 1)= aa1;
*(a + 2)= aa2;
*(a + 3)= aa3;
a += 4;
pb = b + i + 1;
pc0 = pa0 + ldc;
pc1 = pa1 + ldc;
pc2 = pa2 + ldc;
pc3 = pa3 + ldc;
for (k = (n - i - 1); k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl);
vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl);
vb = VLEV_FLOAT(pb, vl);
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl);
vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl);
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
VSSEV_FLOAT(pc2, stride_ldc, vc2, vl);
VSSEV_FLOAT(pc3, stride_ldc, vc3, vl);
pb += vl;
pc0++;
pc1++;
pc2++;
pc3++;
}
pc += 4;
}
if (m & 2)
{
pa0 = pc;
pa1 = pa0 + 1;
aa0 = *pa0 * bb;
aa1 = *pa1 * bb;
*pa0 = aa0;
*pa1 = aa1;
*a = aa0;
*(a + 1)= aa1;
a += 2;
pb = b + i + 1;
pc0 = pa0 + ldc;
pc1 = pa1 + ldc;
for (k = (n - i - 1); k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
vb = VLEV_FLOAT(pb, vl);
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
pb += vl;
pc0++;
pc1++;
}
pc += 2;
}
if (m & 1)
{
pa0 = pc;
aa0 = *pa0 * bb;
*pa0 = aa0;
*a = aa0;
a += 1;
pb = b + i + 1;
pc0 = pa0 + ldc;
for (k = (n - i - 1); k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
vb = VLEV_FLOAT(pb, vl);
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
pb += vl;
pc0++;
}
}
b += n;
}
}
#else
/*
 * Scalar fallback for the forward-substitution step on an m x n tile of `c`.
 *
 *   a   : output buffer; every scaled element is appended to it.
 *   b   : n x n coefficient block read row by row (b advances by n per
 *         column). Column `col` of `c` is multiplied by b[col].
 *   c   : tile updated in place, addressed as c[row + col * ldc].
 *   ldc : distance, in FLOATs, between consecutive columns of `c`.
 */
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {

    int col, row, later;

    for (col = 0; col < n; col++) {
        FLOAT *c_col = c + col * ldc;
        const FLOAT diag = b[col];

        for (row = 0; row < m; row++) {
            const FLOAT scaled = c_col[row] * diag;

            *a = scaled;
            c_col[row] = scaled;
            a++;

            /* Eliminate this element's contribution from the later columns. */
            for (later = col + 1; later < n; later++) {
                c[row + later * ldc] -= scaled * b[later];
            }
        }

        b += n;   /* next row of the coefficient block */
    }
}
#endif
#else
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
FLOAT aa1, aa2;
FLOAT bb1, bb2;
FLOAT cc1, cc2;
int i, j, k;
ldc *= 2;
for (i = 0; i < n; i++) {
bb1 = *(b + i * 2 + 0);
bb2 = *(b + i * 2 + 1);
for (j = 0; j < m; j ++) {
aa1 = *(c + j * 2 + 0 + i * ldc);
aa2 = *(c + j * 2 + 1 + i * ldc);
pci = c + i * ldc * 2;
pcj = c;
for (j = m; j > 0; j -= vl) {
vl = VSETVL(j);
VLSEG2_FLOAT(&va1, &va2, pci, vl);
#ifndef CONJ
cc1 = aa1 * bb1 - aa2 * bb2;
cc2 = aa1 * bb2 + aa2 * bb1;
vs1 = VFMULVF_FLOAT(va1, bb1, vl);
vs1 = VFNMSACVF_FLOAT(vs1, bb2, va2, vl);
vs2 = VFMULVF_FLOAT(va1, bb2, vl);
vs2 = VFMACCVF_FLOAT(vs2, bb1, va2, vl);
#else
cc1 = aa1 * bb1 + aa2 * bb2;
cc2 = -aa1 * bb2 + aa2 * bb1;
vs1 = VFMULVF_FLOAT(va1, bb1, vl);
vs1 = VFMACCVF_FLOAT(vs1, bb2, va2, vl);
vs2 = VFMULVF_FLOAT(va2, bb1, vl);
vs2 = VFNMSACVF_FLOAT(vs2, bb2, va1, vl);
#endif
VSSEG2_FLOAT(a, vs1, vs2, vl);
VSSEG2_FLOAT(pci, vs1, vs2, vl);
a += vl * 2;
pci += vl * 2;
*(a + 0) = cc1;
*(a + 1) = cc2;
*(c + j * 2 + 0 + i * ldc) = cc1;
*(c + j * 2 + 1 + i * ldc) = cc2;
a += 2;
for (k = i + 1; k < n; k ++){
for (k = i + 1; k < n; k ++){
VLSEG2_FLOAT(&vc1, &vc2, pcj + k * ldc * 2, vl);
#ifndef CONJ
*(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1);
*(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0);
vc1 = VFMACCVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl);
vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl);
vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl);
vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl);
#else
*(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1);
*(c + j * 2 + 1 + k * ldc) -= - cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0);
vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl);
vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl);
vc2 = VFMACCVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl);
vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl);
#endif
}
VSSEG2_FLOAT(pcj + k * ldc * 2, vc1, vc2, vl);
}
pcj += vl * 2;
}
b += n * 2;
}
b += n * 2;
}
}
#endif
@ -666,7 +200,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
size_t vl = VSETVL_MAX;
//fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug
//fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug
j = (n >> GEMM_UNROLL_N_SHIFT);

View File

@ -32,25 +32,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define VSETVL_MAX vsetvlmax_e32m2()
#define FLOAT_V_T vfloat32m2_t
#define VLEV_FLOAT vle32_v_f32m2
#define VLSEV_FLOAT vlse32_v_f32m2
#define VLSEG2_FLOAT vlseg2e32_v_f32m2
#define VSEV_FLOAT vse32_v_f32m2
#define VSSEV_FLOAT vsse32_v_f32m2
#define VLSEG2_FLOAT vlseg2e32_v_f32m2
#define VSSEG2_FLOAT vsseg2e32_v_f32m2
#define VFMACCVF_FLOAT vfmacc_vf_f32m2
#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2
#define VFMULVF_FLOAT vfmul_vf_f32m2
#else
#define VSETVL(n) vsetvl_e64m2(n)
#define VSETVL_MAX vsetvlmax_e64m2()
#define FLOAT_V_T vfloat64m2_t
#define VLEV_FLOAT vle64_v_f64m2
#define VLSEV_FLOAT vlse64_v_f64m2
#define VLSEG2_FLOAT vlseg2e64_v_f64m2
#define VSEV_FLOAT vse64_v_f64m2
#define VSSEV_FLOAT vsse64_v_f64m2
#define VLSEG2_FLOAT vlseg2e64_v_f64m2
#define VSSEG2_FLOAT vsseg2e64_v_f64m2
#define VFMVVF_FLOAT vfmv_v_f_f64m2
#define VFMACCVF_FLOAT vfmacc_vf_f64m2
#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2
#define VFMULVF_FLOAT vfmul_vf_f64m2
#endif
@ -86,497 +85,38 @@ static FLOAT dm1 = -1.;
#ifndef COMPLEX
#if GEMM_DEFAULT_UNROLL_N == 1
/*
 * Backward-substitution step on an m x n tile of `c` (real data, variant
 * compiled when GEMM_DEFAULT_UNROLL_N == 1). Columns are processed from
 * n-1 down to 0.
 *
 *   a    : output buffer; the m scaled values of column i are written at
 *          offset i * m (a starts at (n-1)*m and nets -m per column).
 *   b    : n x n coefficient block; row i is used for column i (b starts at
 *          row n-1 and steps back by n). Column i of `c` is multiplied by
 *          b[i], so the diagonal is presumably stored pre-inverted --
 *          TODO confirm against the packing routine.
 *   c    : tile updated in place, addressed as c[row + col * ldc].
 *   ldc  : distance, in FLOATs, between consecutive columns of `c`.
 *
 * For every column i (descending) and row j:
 *   aa = c[j + i*ldc] * b[i];  c[j + k*ldc] -= aa * b[k]  for k = 0..i-1.
 */
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {

    FLOAT aa, bb;
    FLOAT *pb, *pc;
    BLASLONG stride_ldc = sizeof(FLOAT) * ldc;   /* byte stride between columns of c */
    int i, j, k;
    size_t vl;
    FLOAT_V_T vb, vc;

    a += (n - 1) * m;
    b += (n - 1) * n;

    for (i = n - 1; i >= 0; i--) {
        bb = *(b + i);

        for (j = 0; j < m; j++) {
            aa = *(c + j + i * ldc);
            aa *= bb;
            *a = aa;
            *(c + j + i * ldc) = aa;
            a++;

            /* Update the earlier entries of row j: columns 0 .. i-1. */
            pb = b;
            pc = c + j;
            for (k = i; k > 0; k -= vl) {
                vl = VSETVL(k);
                vc = VLSEV_FLOAT(pc, stride_ldc, vl);
                vb = VLEV_FLOAT(pb, vl);
                vc = VFNMSACVF_FLOAT(vc, aa, vb, vl);
                VSSEV_FLOAT(pc, stride_ldc, vc, vl);
                pb += vl;
                /* The chunk covered vl columns, so step vl columns ahead.
                 * (The previous `pc++` moved to the next row instead and
                 * would corrupt `c` whenever a second chunk was needed.) */
                pc += (BLASLONG)vl * ldc;
            }
        }
        b -= n;
        a -= 2 * m;   /* undo the m increments above and step back one column */
    }
}
#elif GEMM_DEFAULT_UNROLL_N == 2
/*
 * Backward-substitution step on an m x n tile of `c` (real data, variant
 * compiled when GEMM_DEFAULT_UNROLL_N == 2). Columns are processed from
 * n-1 down to 0; rows are handled two at a time with a single-row tail
 * when m is odd.
 *
 *   a    : output buffer; the m scaled values of column i are written at
 *          offset i * m (a starts at (n-1)*m and nets -m per column).
 *   b    : n x n coefficient block; row i is used for column i (b starts at
 *          row n-1 and steps back by n). Column i of `c` is multiplied by
 *          b[i], so the diagonal is presumably stored pre-inverted --
 *          TODO confirm against the packing routine.
 *   c    : tile updated in place, addressed as c[row + col * ldc].
 *   ldc  : distance, in FLOATs, between consecutive columns of `c`.
 *
 * For every column i (descending) and row r:
 *   aa = c[r + i*ldc] * b[i];  c[r + k*ldc] -= aa * b[k]  for k = 0..i-1.
 */
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {

    FLOAT aa0, aa1, bb;
    FLOAT *pb, *pc;
    FLOAT *pa0, *pa1, *pc0, *pc1;
    BLASLONG stride_ldc = sizeof(FLOAT) * ldc;   /* byte stride between columns of c */
    int i, j, k;
    size_t vl;
    FLOAT_V_T vb, vc0, vc1;

    a += (n - 1) * m;
    b += (n - 1) * n;

    for (i = n - 1; i >= 0; i--) {
        bb = *(b + i);
        pc = c + i * ldc;

        /* Row pairs (2j, 2j+1) of column i. */
        for (j = 0; j < m/2; j++) {
            pa0 = pc + j * 2;
            pa1 = pc + j * 2 + 1;
            aa0 = *pa0 * bb;
            aa1 = *pa1 * bb;

            *pa0 = aa0;
            *pa1 = aa1;
            *a = aa0;
            *(a + 1) = aa1;
            a += 2;

            /* Propagate into columns 0 .. i-1 of both rows. */
            pb = b;
            pc0 = c + j * 2;
            pc1 = pc0 + 1;
            for (k = i; k > 0; k -= vl) {
                vl = VSETVL(k);
                vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
                vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
                vb = VLEV_FLOAT(pb, vl);
                vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
                vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
                VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
                VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
                pb += vl;
                /* Each chunk spans vl columns; advance by vl columns
                 * (previously `++`, which stepped to the next row). */
                pc0 += (BLASLONG)vl * ldc;
                pc1 += (BLASLONG)vl * ldc;
            }
        }
        pc += (m/2)*2;

        /* Odd trailing row. */
        if (m & 1) {
            pa0 = pc;
            aa0 = *pa0 * bb;

            *pa0 = aa0;
            *a = aa0;
            a += 1;

            pb = b;
            pc0 = pc - i * ldc;   /* same row, column 0 */
            for (k = i; k > 0; k -= vl) {
                vl = VSETVL(k);
                vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
                vb = VLEV_FLOAT(pb, vl);
                vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
                VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
                pb += vl;
                pc0 += (BLASLONG)vl * ldc;   /* advance vl columns, not one row */
            }
        }
        b -= n;
        a -= 2 * m;   /* undo the m increments above and step back one column */
    }
}
#elif GEMM_DEFAULT_UNROLL_N == 4
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
FLOAT aa0, aa1, aa2, aa3;
FLOAT bb;
FLOAT *pb, *pc;
FLOAT *pa0, *pa1, *pa2, *pa3;
FLOAT *pc0, *pc1, *pc2, *pc3;
BLASLONG stride_ldc = sizeof(FLOAT) * ldc;
FLOAT *pci, *pcj;
int i, j, k;
FLOAT_V_T va, vc;
size_t vl;
FLOAT_V_T vb, vc0, vc1, vc2, vc3;
a += (n - 1) * m;
b += (n - 1) * n;
for (i = n - 1; i >= 0; i--)
{
for (i = n - 1; i >= 0; i--) {
bb = *(b + i);
pc = c + i * ldc;
for (j = 0; j < m/4; j ++)
{
pa0 = pc + j * 4;
pa1 = pa0 + 1;
pa2 = pa1 + 1;
pa3 = pa2 + 1;
aa0 = *pa0 * bb;
aa1 = *pa1 * bb;
aa2 = *pa2 * bb;
aa3 = *pa3 * bb;
*pa0 = aa0;
*pa1 = aa1;
*pa2 = aa2;
*pa3 = aa3;
*a = aa0;
*(a + 1)= aa1;
*(a + 2)= aa2;
*(a + 3)= aa3;
a += 4;
pb = b;
pc0 = c + j * 4;
pc1 = pc0 + 1;
pc2 = pc1 + 1;
pc3 = pc2 + 1;
for (k = i; k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl);
vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl);
vb = VLEV_FLOAT(pb, vl);
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl);
vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl);
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
VSSEV_FLOAT(pc2, stride_ldc, vc2, vl);
VSSEV_FLOAT(pc3, stride_ldc, vc3, vl);
pb += vl;
pc0++;
pc1++;
pc2++;
pc3++;
}
}
pc += (m/4)*4;
if (m & 2)
{
pa0 = pc + j * 2;
pa1 = pa0 + 1;
aa0 = *pa0 * bb;
aa1 = *pa1 * bb;
*pa0 = aa0;
*pa1 = aa1;
*a = aa0;
*(a + 1)= aa1;
a += 2;
pb = b;
pc0 = c + j * 4;
pc1 = pc0 + 1;
for (k = i; k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
vb = VLEV_FLOAT(pb, vl);
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
pb += vl;
pc0++;
pc1++;
}
pc += 2;
}
if (m & 1)
{
pa0 = pc;
aa0 = *pa0 * bb;
*pa0 = aa0;
*a = aa0;
a += 1;
pb = b;
pc0 = pc - i * ldc;
for (k = i; k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
vb = VLEV_FLOAT(pb, vl);
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
pb += vl;
pc0++;
}
}
b -= n;
a -= 2 * m;
}
}
#elif GEMM_DEFAULT_UNROLL_N == 8
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
FLOAT aa0, aa1, aa2, aa3, aa4, aa5, aa6, aa7;
FLOAT bb;
FLOAT *pb, *pc;
FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;
FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7;
BLASLONG stride_ldc = sizeof(FLOAT) * ldc;
int i, j, k;
size_t vl;
FLOAT_V_T vb, vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7;
a += (n - 1) * m;
b += (n - 1) * n;
for (i = n - 1; i >= 0; i--)
{
bb = *(b + i);
pc = c + i * ldc;
for (j = 0; j < m/8; j ++)
{
pa0 = pc + j * 8;
pa1 = pa0 + 1;
pa2 = pa1 + 1;
pa3 = pa2 + 1;
pa4 = pa3 + 1;
pa5 = pa4 + 1;
pa6 = pa5 + 1;
pa7 = pa6 + 1;
aa0 = *pa0 * bb;
aa1 = *pa1 * bb;
aa2 = *pa2 * bb;
aa3 = *pa3 * bb;
aa4 = *pa4 * bb;
aa5 = *pa5 * bb;
aa6 = *pa6 * bb;
aa7 = *pa7 * bb;
*pa0 = aa0;
*pa1 = aa1;
*pa2 = aa2;
*pa3 = aa3;
*pa4 = aa4;
*pa5 = aa5;
*pa6 = aa6;
*pa7 = aa7;
*a = aa0;
*(a + 1)= aa1;
*(a + 2)= aa2;
*(a + 3)= aa3;
*(a + 4)= aa4;
*(a + 5)= aa5;
*(a + 6)= aa6;
*(a + 7)= aa7;
a += 8;
pb = b;
pc0 = c + j * 8;
pc1 = pc0 + 1;
pc2 = pc1 + 1;
pc3 = pc2 + 1;
pc4 = pc3 + 1;
pc5 = pc4 + 1;
pc6 = pc5 + 1;
pc7 = pc6 + 1;
for (k = i; k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl);
vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl);
vc4 = VLSEV_FLOAT(pc4, stride_ldc, vl);
vc5 = VLSEV_FLOAT(pc5, stride_ldc, vl);
vc6 = VLSEV_FLOAT(pc6, stride_ldc, vl);
vc7 = VLSEV_FLOAT(pc7, stride_ldc, vl);
vb = VLEV_FLOAT(pb, vl);
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl);
vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl);
vc4 = VFNMSACVF_FLOAT(vc4, aa4, vb, vl);
vc5 = VFNMSACVF_FLOAT(vc5, aa5, vb, vl);
vc6 = VFNMSACVF_FLOAT(vc6, aa6, vb, vl);
vc7 = VFNMSACVF_FLOAT(vc7, aa7, vb, vl);
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
VSSEV_FLOAT(pc2, stride_ldc, vc2, vl);
VSSEV_FLOAT(pc3, stride_ldc, vc3, vl);
VSSEV_FLOAT(pc4, stride_ldc, vc4, vl);
VSSEV_FLOAT(pc5, stride_ldc, vc5, vl);
VSSEV_FLOAT(pc6, stride_ldc, vc6, vl);
VSSEV_FLOAT(pc7, stride_ldc, vc7, vl);
pb += vl;
pc0++;
pc1++;
pc2++;
pc3++;
pc4++;
pc5++;
pc6++;
pc7++;
}
}
pc += (m/8)*8;
if (m & 4)
{
pa0 = pc;
pa1 = pa0 + 1;
pa2 = pa1 + 1;
pa3 = pa2 + 1;
aa0 = *pa0 * bb;
aa1 = *pa1 * bb;
aa2 = *pa2 * bb;
aa3 = *pa3 * bb;
*pa0 = aa0;
*pa1 = aa1;
*pa2 = aa2;
*pa3 = aa3;
*a = aa0;
*(a + 1)= aa1;
*(a + 2)= aa2;
*(a + 3)= aa3;
a += 4;
pb = b;
pc0 = pc - i * ldc;
pc1 = pc0 + 1;
pc2 = pc1 + 1;
pc3 = pc2 + 1;
for (k = i; k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl);
vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl);
vb = VLEV_FLOAT(pb, vl);
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl);
vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl);
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
VSSEV_FLOAT(pc2, stride_ldc, vc2, vl);
VSSEV_FLOAT(pc3, stride_ldc, vc3, vl);
pb += vl;
pc0++;
pc1++;
pc2++;
pc3++;
}
pc += 4;
}
if (m & 2)
{
pa0 = pc;
pa1 = pa0 + 1;
aa0 = *pa0 * bb;
aa1 = *pa1 * bb;
*pa0 = aa0;
*pa1 = aa1;
*a = aa0;
*(a + 1)= aa1;
a += 2;
pb = b;
pc0 = pc - i * ldc;
pc1 = pc0 + 1;
for (k = i; k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
vb = VLEV_FLOAT(pb, vl);
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
pb += vl;
pc0++;
pc1++;
}
pc += 2;
}
if (m & 1)
{
pa0 = pc;
aa0 = *pa0 * bb;
*pa0 = aa0;
*a = aa0;
a += 1;
pb = b;
pc0 = pc - i * ldc;
for (k = i; k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
vb = VLEV_FLOAT(pb, vl);
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
pb += vl;
pc0++;
pci = c + i * ldc;
pcj = c;
for (j = m; j > 0; j -= vl) {
vl = VSETVL(j);
va = VLEV_FLOAT(pci, vl);
va = VFMULVF_FLOAT(va, bb, vl);
VSEV_FLOAT(a, va, vl);
VSEV_FLOAT(pci, va, vl);
a += vl;
pci += vl;
for (k = 0; k < i; k ++){
vc = VLEV_FLOAT(pcj + k * ldc, vl);
vc = VFNMSACVF_FLOAT(vc, *(b + k), va, vl);
VSEV_FLOAT(pcj + k * ldc, vc, vl);
}
pcj += vl;
}
b -= n;
a -= 2 * m;
@ -587,92 +127,65 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
FLOAT aa, bb;
FLOAT bb1, bb2;
int i, j, k;
FLOAT *pci, *pcj;
a += (n - 1) * m;
b += (n - 1) * n;
int i, j, k;
for (i = n - 1; i >= 0; i--) {
FLOAT_V_T va1, va2, vs1, vs2, vc1, vc2;
bb = *(b + i);
size_t vl;
for (j = 0; j < m; j ++) {
aa = *(c + j + i * ldc);
aa *= bb;
*a = aa;
*(c + j + i * ldc) = aa;
a ++;
a += (n - 1) * m * 2;
b += (n - 1) * n * 2;
for (k = 0; k < i; k ++){
*(c + j + k * ldc) -= aa * *(b + k);
}
for (i = n - 1; i >= 0; i--) {
}
b -= n;
a -= 2 * m;
}
}
#endif
#else
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
FLOAT aa1, aa2;
FLOAT bb1, bb2;
FLOAT cc1, cc2;
int i, j, k;
ldc *= 2;
a += (n - 1) * m * 2;
b += (n - 1) * n * 2;
for (i = n - 1; i >= 0; i--) {
bb1 = *(b + i * 2 + 0);
bb2 = *(b + i * 2 + 1);
for (j = 0; j < m; j ++) {
aa1 = *(c + j * 2 + 0 + i * ldc);
aa2 = *(c + j * 2 + 1 + i * ldc);
bb1 = *(b + i * 2 + 0);
bb2 = *(b + i * 2 + 1);
pci = c + i * ldc * 2;
pcj = c;
for (j = m; j > 0; j -= vl) {
vl = VSETVL(j);
VLSEG2_FLOAT(&va1, &va2, pci, vl);
#ifndef CONJ
cc1 = aa1 * bb1 - aa2 * bb2;
cc2 = aa1 * bb2 + aa2 * bb1;
vs1 = VFMULVF_FLOAT(va1, bb1, vl);
vs1 = VFNMSACVF_FLOAT(vs1, bb2, va2, vl);
vs2 = VFMULVF_FLOAT(va1, bb2, vl);
vs2 = VFMACCVF_FLOAT(vs2, bb1, va2, vl);
#else
cc1 = aa1 * bb1 + aa2 * bb2;
cc2 = - aa1 * bb2 + aa2 * bb1;
vs1 = VFMULVF_FLOAT(va1, bb1, vl);
vs1 = VFMACCVF_FLOAT(vs1, bb2, va2, vl);
vs2 = VFMULVF_FLOAT(va2, bb1, vl);
vs2 = VFNMSACVF_FLOAT(vs2, bb2, va1, vl);
#endif
VSSEG2_FLOAT(a, vs1, vs2, vl);
VSSEG2_FLOAT(pci, vs1, vs2, vl);
a += vl * 2;
pci += vl * 2;
*(a + 0) = cc1;
*(a + 1) = cc2;
*(c + j * 2 + 0 + i * ldc) = cc1;
*(c + j * 2 + 1 + i * ldc) = cc2;
a += 2;
for (k = 0; k < i; k ++){
for (k = 0; k < i; k ++){
VLSEG2_FLOAT(&vc1, &vc2, pcj + k * ldc * 2, vl);
#ifndef CONJ
*(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1);
*(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0);
vc1 = VFMACCVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl);
vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl);
vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl);
vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl);
#else
*(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1);
*(c + j * 2 + 1 + k * ldc) -= -cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0);
vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl);
vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl);
vc2 = VFMACCVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl);
vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl);
#endif
}
VSSEG2_FLOAT(pcj + k * ldc * 2, vc1, vc2, vl);
}
pcj += vl * 2;
}
b -= n * 2;
a -= 4 * m;
}
b -= n * 2;
a -= 4 * m;
}
}
#endif
@ -689,7 +202,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
size_t vl = VSETVL_MAX;
//fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug
//fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug
kk = n - offset;
c += n * ldc * COMPSIZE;

View File

@ -0,0 +1,121 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
// Precision-dependent aliases for the RVV intrinsics used by the copy routine
// below. The names select 32-bit (DOUBLE undefined) or 64-bit (DOUBLE defined)
// elements with register grouping m1.
// NOTE(review): presumably FLOAT is float/double accordingly -- confirm in common.h.
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m1(n)
#define FLOAT_V_T vfloat32m1_t
// Segment load of 2 interleaved fields, and segment stores of 2/4/8 fields.
#define VLSEG2_FLOAT vlseg2e32_v_f32m1
#define VSSEG2_FLOAT vsseg2e32_v_f32m1
#define VSSEG4_FLOAT vsseg4e32_v_f32m1
#define VSSEG8_FLOAT vsseg8e32_v_f32m1
#else
#define VSETVL(n) vsetvl_e64m1(n)
#define FLOAT_V_T vfloat64m1_t
// Segment load of 2 interleaved fields, and segment stores of 2/4/8 fields.
#define VLSEG2_FLOAT vlseg2e64_v_f64m1
#define VSSEG2_FLOAT vsseg2e64_v_f64m1
#define VSSEG4_FLOAT vsseg4e64_v_f64m1
#define VSSEG8_FLOAT vsseg8e64_v_f64m1
#endif
// Vectorised counterpart of ../generic/zgemm_ncopy_4.c.
//
// Copies an m x n matrix whose elements are FLOAT pairs from `a` into the
// packed buffer `b`. Source vectors ("columns", assuming column-major
// storage) are `lda` elements apart, i.e. 2 * lda FLOATs. Columns are taken
// in groups of 4, then 2, then 1; inside a group the output holds, for each
// row in turn, the FLOAT pair of every column of the group back to back.
// Always returns 0.
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){

    const BLASLONG col_step = 2 * lda;   // FLOATs between adjacent columns
    FLOAT *src = a;
    FLOAT *dst = b;
    FLOAT *c0, *c1, *c2, *c3;
    FLOAT_V_T lo0, hi0, lo1, hi1, lo2, hi2, lo3, hi3;
    BLASLONG groups, left;
    size_t vl;

    // Groups of four columns: 8 FLOATs emitted per row.
    groups = n >> 2;
    while (groups-- > 0) {
        c0 = src;
        c1 = c0 + col_step;
        c2 = c1 + col_step;
        c3 = c2 + col_step;
        src += 4 * col_step;

        left = m;
        while (left > 0) {
            vl = VSETVL(left);

            // De-interleave each column into (first, second) FLOAT of the pair ...
            VLSEG2_FLOAT(&lo0, &hi0, c0, vl);
            VLSEG2_FLOAT(&lo1, &hi1, c1, vl);
            VLSEG2_FLOAT(&lo2, &hi2, c2, vl);
            VLSEG2_FLOAT(&lo3, &hi3, c3, vl);

            // ... and re-interleave all eight fields row by row.
            VSSEG8_FLOAT(dst, lo0, hi0, lo1, hi1, lo2, hi2, lo3, hi3, vl);

            c0 += vl * 2;
            c1 += vl * 2;
            c2 += vl * 2;
            c3 += vl * 2;
            dst += vl * 8;
            left -= vl;
        }
    }

    // Remaining pair of columns: 4 FLOATs emitted per row.
    if (n & 2) {
        c0 = src;
        c1 = c0 + col_step;
        src += 2 * col_step;

        left = m;
        while (left > 0) {
            vl = VSETVL(left);

            VLSEG2_FLOAT(&lo0, &hi0, c0, vl);
            VLSEG2_FLOAT(&lo1, &hi1, c1, vl);

            VSSEG4_FLOAT(dst, lo0, hi0, lo1, hi1, vl);

            c0 += vl * 2;
            c1 += vl * 2;
            dst += vl * 4;
            left -= vl;
        }
    }

    // Last single column: a straight copy of its FLOAT pairs.
    if (n & 1) {
        c0 = src;
        src += col_step;

        left = m;
        while (left > 0) {
            vl = VSETVL(left);

            VLSEG2_FLOAT(&lo0, &hi0, c0, vl);
            VSSEG2_FLOAT(dst, lo0, hi0, vl);

            c0 += vl * 2;
            dst += vl * 2;
            left -= vl;
        }
    }

    return 0;
}

View File

@ -0,0 +1,74 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include "common.h"
// Precision-dependent aliases for the RVV intrinsics used by the copy routine
// below. The names select 32-bit (DOUBLE undefined) or 64-bit (DOUBLE defined)
// elements with register grouping m2.
// NOTE(review): presumably FLOAT is float/double accordingly -- confirm in common.h.
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m2(n)
#define FLOAT_V_T vfloat32m2_t
// Strided 2-field segment load and unit-stride 2-field segment store.
#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2
#define VSSEG2_FLOAT vsseg2e32_v_f32m2
#else
#define VSETVL(n) vsetvl_e64m2(n)
#define FLOAT_V_T vfloat64m2_t
// Strided 2-field segment load and unit-stride 2-field segment store.
#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2
#define VSSEG2_FLOAT vsseg2e64_v_f64m2
#endif
// Packs an m x n matrix whose elements are FLOAT pairs from `a` into `b`.
//
// Source vectors ("columns", assuming column-major storage) are `lda`
// elements apart, i.e. 2 * lda FLOATs. Columns are consumed in panels of
// `vl` (as returned by VSETVL for the columns still to do); for each panel
// the output holds, row after row, the FLOAT pairs of all `vl` columns of
// that row stored contiguously. Always returns 0.
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){

    // Byte distance between the same row of two adjacent columns.
    const BLASLONG col_stride_bytes = lda * sizeof(FLOAT) * 2;
    FLOAT *panel_src = a;
    FLOAT *dst = b;
    FLOAT *row_src;
    FLOAT_V_T first, second;
    BLASLONG cols_left, row;
    size_t vl;

    cols_left = n;
    while (cols_left > 0) {
        vl = VSETVL(cols_left);

        row_src = panel_src;
        panel_src += vl * lda * 2;

        for (row = 0; row < m; row++) {
            // Gather one row across the vl columns of the panel, then write
            // the pairs out back to back.
            VLSSEG2_FLOAT(&first, &second, row_src, col_stride_bytes, vl);
            VSSEG2_FLOAT(dst, first, second, vl);

            row_src += 2;
            dst += vl * 2;
        }

        cols_left -= vl;
    }

    return 0;
}

View File

@ -0,0 +1,181 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Precision-dependent aliases for the RVV intrinsics used by the routine
 * below: 32-bit element variants when DOUBLE is not defined, 64-bit element
 * variants otherwise.  Both sets use the "m1" register grouping; the
 * VLSSEGn / VSSEGn names are the strided segment loads and unit-stride
 * segment stores with n = 2, 4 or 8 fields.
 */
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m1(n)
#define FLOAT_V_T vfloat32m1_t
#define VLEV_FLOAT vle32_v_f32m1
#define VSEV_FLOAT vse32_v_f32m1
#define VLSSEG2_FLOAT vlsseg2e32_v_f32m1
#define VLSSEG4_FLOAT vlsseg4e32_v_f32m1
#define VLSSEG8_FLOAT vlsseg8e32_v_f32m1
#define VSSEG2_FLOAT vsseg2e32_v_f32m1
#define VSSEG4_FLOAT vsseg4e32_v_f32m1
#define VSSEG8_FLOAT vsseg8e32_v_f32m1
#else
#define VSETVL(n) vsetvl_e64m1(n)
#define FLOAT_V_T vfloat64m1_t
#define VLEV_FLOAT vle64_v_f64m1
#define VSEV_FLOAT vse64_v_f64m1
#define VLSSEG2_FLOAT vlsseg2e64_v_f64m1
#define VLSSEG4_FLOAT vlsseg4e64_v_f64m1
#define VLSSEG8_FLOAT vlsseg8e64_v_f64m1
#define VSSEG2_FLOAT vsseg2e64_v_f64m1
#define VSSEG4_FLOAT vsseg4e64_v_f64m1
#define VSSEG8_FLOAT vsseg8e64_v_f64m1
#endif
/*
 * Packs an m x n block of two-FLOAT elements (real/imaginary pairs) from `a`
 * into `b` in panels that are four elements wide.
 *
 *   m   - extent along the lda-strided direction of `a`; handled in groups
 *         of 4, then 2, then 1
 *   n   - extent along the contiguous direction of `a`; handled in groups of
 *         4 elements, then a 2-element and a 1-element remainder
 *   lda - distance between consecutive strided lines, in two-FLOAT elements
 *
 * Output layout:
 *   - 4-wide panels start at `b`; successive panels are m * 8 FLOATs apart
 *   - the 2-wide remainder starts at b + 2 * m * (n & ~3)
 *   - the 1-wide remainder starts at b + 2 * m * (n & ~1)
 *
 * The vector length of every access is requested through VSETVL and the
 * access is repeated until the whole group has been moved.  The routine
 * therefore does not rely on the hardware being able to hold 4 lines (or 8
 * FLOATs) in a single m1 register group; when it can, each inner loop runs
 * exactly once.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
    BLASLONG i, j, r;
    BLASLONG stride_lda = sizeof(FLOAT) * lda * 2;   /* byte stride between lines */

    IFLOAT *aoffset;
    IFLOAT *aoffset1;
    IFLOAT *boffset, *boffset1, *boffset2, *boffset3;

    FLOAT_V_T v0, v1, v2, v3, v4, v5, v6, v7;
    size_t vl;

    aoffset  = a;
    boffset  = b;
    boffset2 = b + 2 * m * (n & ~3);
    boffset3 = b + 2 * m * (n & ~1);

    /* Groups of four lines. */
    for (j = (m >> 2); j > 0; j--) {
        aoffset1 = aoffset;
        aoffset += 8 * lda;

        boffset1 = boffset;
        boffset += 32;

        for (i = (n >> 2); i > 0; i--) {
            /* 4 lines x 4 elements: one 8-field segment per line. */
            for (r = 0; r < 4; r += vl) {
                vl = VSETVL(4 - r);
                VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7,
                              aoffset1 + r * lda * 2, stride_lda, vl);
                VSSEG8_FLOAT(boffset1 + r * 8, v0, v1, v2, v3, v4, v5, v6, v7, vl);
            }
            aoffset1 += 8;
            boffset1 += m * 8;
        }

        if (n & 2) {
            /* 4 lines x 2 elements. */
            for (r = 0; r < 4; r += vl) {
                vl = VSETVL(4 - r);
                VLSSEG4_FLOAT(&v0, &v1, &v2, &v3,
                              aoffset1 + r * lda * 2, stride_lda, vl);
                VSSEG4_FLOAT(boffset2 + r * 4, v0, v1, v2, v3, vl);
            }
            aoffset1 += 4;
            boffset2 += 16;
        }

        if (n & 1) {
            /* 4 lines x 1 element. */
            for (r = 0; r < 4; r += vl) {
                vl = VSETVL(4 - r);
                VLSSEG2_FLOAT(&v0, &v1, aoffset1 + r * lda * 2, stride_lda, vl);
                VSSEG2_FLOAT(boffset3 + r * 2, v0, v1, vl);
            }
            boffset3 += 8;
        }
    }

    /* Remaining group of two lines. */
    if (m & 2) {
        aoffset1 = aoffset;
        aoffset += 4 * lda;

        boffset1 = boffset;
        boffset += 16;

        for (i = (n >> 2); i > 0; i--) {
            for (r = 0; r < 2; r += vl) {
                vl = VSETVL(2 - r);
                VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7,
                              aoffset1 + r * lda * 2, stride_lda, vl);
                VSSEG8_FLOAT(boffset1 + r * 8, v0, v1, v2, v3, v4, v5, v6, v7, vl);
            }
            aoffset1 += 8;
            boffset1 += m * 8;
        }

        if (n & 2) {
            for (r = 0; r < 2; r += vl) {
                vl = VSETVL(2 - r);
                VLSSEG4_FLOAT(&v0, &v1, &v2, &v3,
                              aoffset1 + r * lda * 2, stride_lda, vl);
                VSSEG4_FLOAT(boffset2 + r * 4, v0, v1, v2, v3, vl);
            }
            aoffset1 += 4;
            boffset2 += 8;
        }

        if (n & 1) {
            for (r = 0; r < 2; r += vl) {
                vl = VSETVL(2 - r);
                VLSSEG2_FLOAT(&v0, &v1, aoffset1 + r * lda * 2, stride_lda, vl);
                VSSEG2_FLOAT(boffset3 + r * 2, v0, v1, vl);
            }
            boffset3 += 4;
        }
    }

    /* Last single line: plain contiguous copies. */
    if (m & 1) {
        aoffset1 = aoffset;
        boffset1 = boffset;

        for (i = (n >> 2); i > 0; i--) {
            /* 4 elements = 8 FLOATs. */
            for (r = 0; r < 8; r += vl) {
                vl = VSETVL(8 - r);
                v0 = VLEV_FLOAT(aoffset1 + r, vl);
                VSEV_FLOAT(boffset1 + r, v0, vl);
            }
            aoffset1 += 8;
            boffset1 += 8 * m;
        }

        if (n & 2) {
            /* 2 elements = 4 FLOATs. */
            for (r = 0; r < 4; r += vl) {
                vl = VSETVL(4 - r);
                v0 = VLEV_FLOAT(aoffset1 + r, vl);
                VSEV_FLOAT(boffset2 + r, v0, vl);
            }
            aoffset1 += 4;
        }

        if (n & 1) {
            *(boffset3)     = *(aoffset1);
            *(boffset3 + 1) = *(aoffset1 + 1);
        }
    }

    return 0;
}

View File

@ -0,0 +1,74 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Precision-dependent aliases for the RVV intrinsics used by the routine
 * below: 32-bit element variants when DOUBLE is not defined, 64-bit element
 * variants otherwise.  Both sets use the "m2" register grouping.
 */
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m2(n)
#define FLOAT_V_T vfloat32m2_t
#define VLSEG2_FLOAT vlseg2e32_v_f32m2
#define VSSEG2_FLOAT vsseg2e32_v_f32m2
#else
#define VSETVL(n) vsetvl_e64m2(n)
#define FLOAT_V_T vfloat64m2_t
#define VLSEG2_FLOAT vlseg2e64_v_f64m2
#define VSSEG2_FLOAT vsseg2e64_v_f64m2
#endif
/*
 * Packs a block of two-FLOAT elements (real/imaginary pairs, judging by the
 * 2-field segment accesses) from `a` into the contiguous buffer `b`.
 *
 *   m   - number of lines visited; the source advances by lda elements
 *         (lda * 2 FLOATs) from one line to the next
 *   n   - number of contiguous elements per line, processed in chunks of
 *         `vl` elements
 *   lda - distance between consecutive lines, in two-FLOAT elements
 *
 * For every chunk, and for each of the m lines, `vl` contiguous elements are
 * loaded and written back to back into `b`.  Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
{
    IFLOAT *chunk_base = a;
    IFLOAT *dst = b;
    BLASLONG remaining = n;

    while (remaining > 0) {
        size_t vl = VSETVL(remaining);
        IFLOAT *src = chunk_base;
        FLOAT_V_T re, im;
        BLASLONG line;

        for (line = 0; line < m; line++) {
            /* Unit-stride load of vl (re, im) pairs, stored unchanged. */
            VLSEG2_FLOAT(&re, &im, src, vl);
            VSSEG2_FLOAT(dst, re, im, vl);

            src += lda * 2;
            dst += vl * 2;
        }

        /* Next group of vl contiguous elements. */
        chunk_base += vl * 2;
        remaining -= vl;
    }

    return 0;
}

View File

@ -0,0 +1,475 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Precision-dependent aliases for the RVV intrinsics used by the kernel
 * below: 32-bit element variants when DOUBLE is not defined, 64-bit element
 * variants otherwise.  Both sets use the "m2" register grouping.
 */
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m2(n)
#define FLOAT_V_T vfloat32m2_t
#define VLEV_FLOAT vle32_v_f32m2
#define VSEV_FLOAT vse32_v_f32m2
#define VLSEG2_FLOAT vlseg2e32_v_f32m2
#define VSSEG2_FLOAT vsseg2e32_v_f32m2
#define VFMVVF_FLOAT vfmv_v_f_f32m2
#define VFMACCVF_FLOAT vfmacc_vf_f32m2
#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2
#else
#define VSETVL(n) vsetvl_e64m2(n)
#define FLOAT_V_T vfloat64m2_t
#define VLEV_FLOAT vle64_v_f64m2
#define VSEV_FLOAT vse64_v_f64m2
#define VLSEG2_FLOAT vlseg2e64_v_f64m2
#define VSSEG2_FLOAT vsseg2e64_v_f64m2
#define VFMVVF_FLOAT vfmv_v_f_f64m2
#define VFMACCVF_FLOAT vfmacc_vf_f64m2
#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2
#endif
/*
 * Sign table for the four partial products of a complex multiplication.
 * Each OP_xx is either VFMACCVF_FLOAT (accumulator += scalar * vector) or
 * VFNMSACVF_FLOAT (accumulator -= scalar * vector), chosen per build variant
 * (NN, NT, ... CC -- presumably the conjugation combinations of the two
 * operands; confirm against the build rules).
 */
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define OP_rr VFMACCVF_FLOAT
#define OP_ir VFMACCVF_FLOAT
#define OP_ii VFNMSACVF_FLOAT
#define OP_ri VFMACCVF_FLOAT
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define OP_rr VFMACCVF_FLOAT
#define OP_ir VFMACCVF_FLOAT
#define OP_ii VFMACCVF_FLOAT
#define OP_ri VFNMSACVF_FLOAT
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define OP_rr VFMACCVF_FLOAT
#define OP_ir VFNMSACVF_FLOAT
#define OP_ii VFMACCVF_FLOAT
#define OP_ri VFMACCVF_FLOAT
#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
#define OP_rr VFMACCVF_FLOAT
#define OP_ir VFNMSACVF_FLOAT
#define OP_ii VFNMSACVF_FLOAT
#define OP_ri VFNMSACVF_FLOAT
#endif
/*
 * Local building blocks for the kernel below.  They refer to the kernel's
 * local variables (vl, ptrba, ptrbb, vres0..vres7, alphar, alphai) by name
 * and are #undef'd again after the function.
 */

/* acc += a * b for one complex scalar b = (bptr[0], bptr[1]) and a vector of
 * complex values a = (a_r, a_i); signs are selected by the OP_* table. */
#define RVV_ZK_CMLA(acc_r, acc_i, bptr, a_r, a_i)        \
    do {                                                 \
        acc_r = OP_rr(acc_r, *((bptr) + 0), a_r, vl);    \
        acc_i = OP_ir(acc_i, *((bptr) + 0), a_i, vl);    \
        acc_r = OP_ii(acc_r, *((bptr) + 1), a_i, vl);    \
        acc_i = OP_ri(acc_i, *((bptr) + 1), a_r, vl);    \
    } while (0)

/* Load the next vl complex values of the packed A panel. */
#define RVV_ZK_LOAD_A(a_r, a_i)                          \
    do {                                                 \
        VLSEG2_FLOAT(&a_r, &a_i, ptrba, vl);             \
        ptrba += vl * 2;                                 \
    } while (0)

/* One k step against four / two / one packed B values. */
#define RVV_ZK_UPDATE_X4(a_r, a_i)                       \
    do {                                                 \
        RVV_ZK_CMLA(vres0, vres1, ptrbb + 0, a_r, a_i);  \
        RVV_ZK_CMLA(vres2, vres3, ptrbb + 2, a_r, a_i);  \
        RVV_ZK_CMLA(vres4, vres5, ptrbb + 4, a_r, a_i);  \
        RVV_ZK_CMLA(vres6, vres7, ptrbb + 6, a_r, a_i);  \
        ptrbb += 8;                                      \
    } while (0)

#define RVV_ZK_UPDATE_X2(a_r, a_i)                       \
    do {                                                 \
        RVV_ZK_CMLA(vres0, vres1, ptrbb + 0, a_r, a_i);  \
        RVV_ZK_CMLA(vres2, vres3, ptrbb + 2, a_r, a_i);  \
        ptrbb += 4;                                      \
    } while (0)

#define RVV_ZK_UPDATE_X1(a_r, a_i)                       \
    do {                                                 \
        RVV_ZK_CMLA(vres0, vres1, ptrbb + 0, a_r, a_i);  \
        ptrbb += 2;                                      \
    } while (0)

/* Four k steps with the A loads interleaved between the updates, in the
 * same order as the hand-unrolled original. */
#define RVV_ZK_UNROLL4(UPDATE)                           \
    do {                                                 \
        RVV_ZK_LOAD_A(va0, va1);                         \
        RVV_ZK_LOAD_A(va2, va3);                         \
        UPDATE(va0, va1);                                \
        RVV_ZK_LOAD_A(va4, va5);                         \
        UPDATE(va2, va3);                                \
        RVV_ZK_LOAD_A(va6, va7);                         \
        UPDATE(va4, va5);                                \
        UPDATE(va6, va7);                                \
    } while (0)

/* c += alpha * p (complex), then store c to cptr. */
#define RVV_ZK_SCALE_STORE(cptr, c_r, c_i, p_r, p_i)     \
    do {                                                 \
        c_r = VFMACCVF_FLOAT(c_r, alphar, p_r, vl);      \
        c_i = VFMACCVF_FLOAT(c_i, alphar, p_i, vl);      \
        c_r = VFNMSACVF_FLOAT(c_r, alphai, p_i, vl);     \
        c_i = VFMACCVF_FLOAT(c_i, alphai, p_r, vl);      \
        VSSEG2_FLOAT(cptr, c_r, c_i, vl);                \
    } while (0)

/*
 * Complex GEMM micro-kernel: C += alpha * (A * B) on packed operands, with
 * the signs of the partial products taken from the OP_* table.
 *
 *   bm, bn, bk     - extents of the update: bm elements per column of C,
 *                    bn columns of C, bk accumulation steps
 *   alphar, alphai - real and imaginary part of the scaling factor
 *   ba             - packed A: for every strip of vl elements, bk groups of
 *                    vl interleaved (re, im) pairs
 *   bb             - packed B: per panel, bk groups of 4 (then 2, then 1)
 *                    interleaved (re, im) scalars
 *   C, ldc         - output, interleaved (re, im); columns are ldc elements
 *                    (2 * ldc FLOATs) apart
 *   offset         - only present when TRMMKERNEL is defined; not used here
 *
 * Columns of C are processed in panels of 4, then 2, then 1; within a panel
 * the bm direction is strip-mined with VSETVL.  Always returns 0.
 */
int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
#ifdef TRMMKERNEL
		, BLASLONG offset
#endif
		)
{
    BLASLONG rows_left, panel, k;
    FLOAT *C0, *C1, *C2, *C3, *ptrba, *ptrbb;

    FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7;
    FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7;

    size_t vl;

    /* ---- panels of four columns ---- */
    for (panel = bn / 4; panel > 0; panel--) {
        C0 = C;
        C1 = C0 + 2 * ldc;
        C2 = C1 + 2 * ldc;
        C3 = C2 + 2 * ldc;
        ptrba = ba;

        for (rows_left = bm; rows_left > 0; rows_left -= vl) {
            vl = VSETVL(rows_left);
            ptrbb = bb;

            vres0 = VFMVVF_FLOAT(0.0, vl);
            vres1 = VFMVVF_FLOAT(0.0, vl);
            vres2 = VFMVVF_FLOAT(0.0, vl);
            vres3 = VFMVVF_FLOAT(0.0, vl);
            vres4 = VFMVVF_FLOAT(0.0, vl);
            vres5 = VFMVVF_FLOAT(0.0, vl);
            vres6 = VFMVVF_FLOAT(0.0, vl);
            vres7 = VFMVVF_FLOAT(0.0, vl);

            for (k = bk / 4; k > 0; k--) {
                RVV_ZK_UNROLL4(RVV_ZK_UPDATE_X4);
            }
            for (k = (bk & 3); k > 0; k--) {
                RVV_ZK_LOAD_A(va0, va1);
                RVV_ZK_UPDATE_X4(va0, va1);
            }

            /* Columns 0 and 1. */
            VLSEG2_FLOAT(&va0, &va1, C0, vl);
            VLSEG2_FLOAT(&va2, &va3, C1, vl);
            RVV_ZK_SCALE_STORE(C0, va0, va1, vres0, vres1);
            RVV_ZK_SCALE_STORE(C1, va2, va3, vres2, vres3);

            /* Columns 2 and 3. */
            VLSEG2_FLOAT(&va0, &va1, C2, vl);
            VLSEG2_FLOAT(&va2, &va3, C3, vl);
            RVV_ZK_SCALE_STORE(C2, va0, va1, vres4, vres5);
            RVV_ZK_SCALE_STORE(C3, va2, va3, vres6, vres7);

            C0 += vl * 2;
            C1 += vl * 2;
            C2 += vl * 2;
            C3 += vl * 2;
        }

        bb += (bk << 3);
        C  += (ldc << 3);
    }

    /* ---- remaining panel of two columns ---- */
    if (bn & 2) {
        C0 = C;
        C1 = C0 + 2 * ldc;
        ptrba = ba;

        for (rows_left = bm; rows_left > 0; rows_left -= vl) {
            vl = VSETVL(rows_left);
            ptrbb = bb;

            vres0 = VFMVVF_FLOAT(0.0, vl);
            vres1 = VFMVVF_FLOAT(0.0, vl);
            vres2 = VFMVVF_FLOAT(0.0, vl);
            vres3 = VFMVVF_FLOAT(0.0, vl);

            for (k = bk / 4; k > 0; k--) {
                RVV_ZK_UNROLL4(RVV_ZK_UPDATE_X2);
            }
            for (k = (bk & 3); k > 0; k--) {
                RVV_ZK_LOAD_A(va0, va1);
                RVV_ZK_UPDATE_X2(va0, va1);
            }

            VLSEG2_FLOAT(&va0, &va1, C0, vl);
            VLSEG2_FLOAT(&va2, &va3, C1, vl);
            RVV_ZK_SCALE_STORE(C0, va0, va1, vres0, vres1);
            RVV_ZK_SCALE_STORE(C1, va2, va3, vres2, vres3);

            C0 += vl * 2;
            C1 += vl * 2;
        }

        bb += (bk << 2);
        C  += (ldc << 2);
    }

    /* ---- last single column ---- */
    if (bn & 1) {
        C0 = C;
        ptrba = ba;

        for (rows_left = bm; rows_left > 0; rows_left -= vl) {
            vl = VSETVL(rows_left);
            ptrbb = bb;

            vres0 = VFMVVF_FLOAT(0.0, vl);
            vres1 = VFMVVF_FLOAT(0.0, vl);

            for (k = bk / 4; k > 0; k--) {
                RVV_ZK_UNROLL4(RVV_ZK_UPDATE_X1);
            }
            for (k = (bk & 3); k > 0; k--) {
                RVV_ZK_LOAD_A(va0, va1);
                RVV_ZK_UPDATE_X1(va0, va1);
            }

            VLSEG2_FLOAT(&va0, &va1, C0, vl);
            RVV_ZK_SCALE_STORE(C0, va0, va1, vres0, vres1);

            C0 += vl * 2;
        }

        bb += bk << 1;
        C  += ldc << 1;
    }

    return 0;
}

#undef RVV_ZK_SCALE_STORE
#undef RVV_ZK_UNROLL4
#undef RVV_ZK_UPDATE_X1
#undef RVV_ZK_UPDATE_X2
#undef RVV_ZK_UPDATE_X4
#undef RVV_ZK_LOAD_A
#undef RVV_ZK_CMLA

View File

@ -0,0 +1,124 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Precision-dependent aliases for the RVV intrinsics used by the routine
 * below: 32-bit element variants when DOUBLE is not defined, 64-bit element
 * variants otherwise.  Both sets use the "m2" register grouping; the integer
 * vector and mask types are the ones matching that element width.
 */
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m2(n)
#define VSETVL_MAX vsetvlmax_e32m2()
#define FLOAT_V_T vfloat32m2_t
#define VLEV_FLOAT vle32_v_f32m2
#define VSEV_FLOAT vse32_v_f32m2
#define VLSEV_FLOAT vlse32_v_f32m2
#define VLSEG2_FLOAT vlseg2e32_v_f32m2
#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2
#define VSSEG2_FLOAT vsseg2e32_v_f32m2
#define INT_V_T vint32m2_t
#define VID_V_INT vid_v_i32m2
#define VADD_VX_INT vadd_vx_i32m2
#define VFRSUB_VF_FLOAT vfrsub_vf_f32m2
#define VMSGT_VX_INT vmsgt_vx_i32m2_b16
#define VMSLT_VX_INT vmslt_vx_i32m2_b16
#define VMSEQ_VX_INT vmseq_vx_i32m2_b16
#define VBOOL_T vbool16_t
#define VMERGE_VVM_FLOAT vmerge_vvm_f32m2
#define VFMVVF_FLOAT vfmv_v_f_f32m2
#else
#define VSETVL(n) vsetvl_e64m2(n)
#define VSETVL_MAX vsetvlmax_e64m2()
#define FLOAT_V_T vfloat64m2_t
#define VLEV_FLOAT vle64_v_f64m2
#define VSEV_FLOAT vse64_v_f64m2
#define VLSEV_FLOAT vlse64_v_f64m2
#define VLSEG2_FLOAT vlseg2e64_v_f64m2
#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2
#define VSSEG2_FLOAT vsseg2e64_v_f64m2
#define INT_V_T vint64m2_t
#define VID_V_INT vid_v_i64m2
#define VADD_VX_INT vadd_vx_i64m2
#define VFRSUB_VF_FLOAT vfrsub_vf_f64m2
#define VMSGT_VX_INT vmsgt_vx_i64m2_b32
#define VMSLT_VX_INT vmslt_vx_i64m2_b32
#define VMSEQ_VX_INT vmseq_vx_i64m2_b32
#define VBOOL_T vbool32_t
#define VMERGE_VVM_FLOAT vmerge_vvm_f64m2
#define VFMVVF_FLOAT vfmv_v_f_f64m2
#endif
/*
 * Packs an m x n block of a matrix of (re, im) pairs into `b`, reading each
 * element from one of two mirrored locations depending on which side of the
 * diagonal it lies, and adjusting the imaginary part (this is the treatment
 * a Hermitian matrix stored in one triangle needs).
 *
 *   m, n       - number of packed lines and number of lanes per line group
 *   a, lda     - source matrix; lda counts (re, im) pairs
 *   posX, posY - position of the block inside `a`
 *   b          - destination, written contiguously
 *
 * For every lane the signed distance d = (posX + lane) - (posY + line) is
 * evaluated:
 *   d > 0 : value comes from the unit-stride source, imaginary part kept
 *   d < 0 : value comes from the lda-strided source, imaginary part negated
 *   d = 0 : value comes from the lda-strided source, imaginary part set to ZERO
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b)
{
    BLASLONG stride_bytes = sizeof(FLOAT) * lda * 2;
    BLASLONG remaining, line, diag;
    FLOAT *unit_src, *strided_src;

    FLOAT_V_T unit_re, unit_im, strided_re, strided_im;
    FLOAT_V_T out_re, out_im, neg_im, zeros;
    VBOOL_T is_pos, is_neg, is_zero;
    INT_V_T lane_ids, lane_diag;

    size_t vl = VSETVL_MAX;

    lane_ids = VID_V_INT(vl);            /* 0, 1, 2, ... */
    zeros    = VFMVVF_FLOAT(ZERO, vl);

    remaining = n;
    while (remaining > 0) {
        vl = VSETVL(remaining);

        diag        = posX - posY;
        unit_src    = a + posX * 2 + posY * lda * 2;
        strided_src = a + posY * 2 + posX * lda * 2;

        for (line = 0; line < m; line++) {
            VLSSEG2_FLOAT(&strided_re, &strided_im, strided_src, stride_bytes, vl);
            VLSEG2_FLOAT(&unit_re, &unit_im, unit_src, vl);

            /* Per-lane distance from the diagonal and its sign. */
            lane_diag = VADD_VX_INT(lane_ids, diag, vl);
            is_pos    = VMSGT_VX_INT(lane_diag, 0, vl);
            is_neg    = VMSLT_VX_INT(lane_diag, 0, vl);
            is_zero   = VMSEQ_VX_INT(lane_diag, 0, vl);

            /* The merge picks its last vector operand where the mask is set. */
            out_re = VMERGE_VVM_FLOAT(is_pos, strided_re, unit_re, vl);
            out_im = VMERGE_VVM_FLOAT(is_pos, strided_im, unit_im, vl);

            neg_im = VFRSUB_VF_FLOAT(out_im, ZERO, vl);              /* ZERO - im */
            out_im = VMERGE_VVM_FLOAT(is_neg, out_im, neg_im, vl);
            out_im = VMERGE_VVM_FLOAT(is_zero, out_im, zeros, vl);

            VSSEG2_FLOAT(b, out_re, out_im, vl);

            b           += vl * 2;
            unit_src    += lda * 2;
            strided_src += 2;
            diag--;
        }

        remaining -= vl;
        posX      += vl;
    }

    return 0;
}

View File

@ -0,0 +1,120 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Precision-dependent aliases for the RVV intrinsics used by the routine
 * below: 32-bit element variants when DOUBLE is not defined, 64-bit element
 * variants otherwise.  Both sets use the "m2" register grouping; the integer
 * vector and mask types are the ones matching that element width.
 */
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m2(n)
#define VSETVL_MAX vsetvlmax_e32m2()
#define FLOAT_V_T vfloat32m2_t
#define VLEV_FLOAT vle32_v_f32m2
#define VSEV_FLOAT vse32_v_f32m2
#define VLSEV_FLOAT vlse32_v_f32m2
#define VLSEG2_FLOAT vlseg2e32_v_f32m2
#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2
#define VSSEG2_FLOAT vsseg2e32_v_f32m2
#define INT_V_T vint32m2_t
#define VID_V_INT vid_v_i32m2
#define VADD_VX_INT vadd_vx_i32m2
#define VFRSUB_VF_FLOAT vfrsub_vf_f32m2
#define VMSGT_VX_INT vmsgt_vx_i32m2_b16
#define VMSLT_VX_INT vmslt_vx_i32m2_b16
#define VMSEQ_VX_INT vmseq_vx_i32m2_b16
#define VBOOL_T vbool16_t
#define VMERGE_VVM_FLOAT vmerge_vvm_f32m2
#define VFMVVF_FLOAT vfmv_v_f_f32m2
#else
#define VSETVL(n) vsetvl_e64m2(n)
#define VSETVL_MAX vsetvlmax_e64m2()
#define FLOAT_V_T vfloat64m2_t
#define VLEV_FLOAT vle64_v_f64m2
#define VSEV_FLOAT vse64_v_f64m2
#define VLSEV_FLOAT vlse64_v_f64m2
#define VLSEG2_FLOAT vlseg2e64_v_f64m2
#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2
#define VSSEG2_FLOAT vsseg2e64_v_f64m2
#define INT_V_T vint64m2_t
#define VID_V_INT vid_v_i64m2
#define VADD_VX_INT vadd_vx_i64m2
#define VFRSUB_VF_FLOAT vfrsub_vf_f64m2
#define VMSGT_VX_INT vmsgt_vx_i64m2_b32
#define VMSLT_VX_INT vmslt_vx_i64m2_b32
#define VMSEQ_VX_INT vmseq_vx_i64m2_b32
#define VBOOL_T vbool32_t
#define VMERGE_VVM_FLOAT vmerge_vvm_f64m2
#define VFMVVF_FLOAT vfmv_v_f_f64m2
#endif
/*
 * Packs an m x n block of a matrix of (re, im) pairs into `b`, reading each
 * element from one of two mirrored locations depending on which side of the
 * diagonal it lies, and adjusting the imaginary part (this is the treatment
 * a Hermitian matrix stored in one triangle needs).
 *
 *   m, n       - number of packed lines and number of lanes per line group
 *   a, lda     - source matrix; lda counts (re, im) pairs
 *   posX, posY - position of the block inside `a`
 *   b          - destination, written contiguously
 *
 * For every lane the signed distance d = (posX + lane) - (posY + line) is
 * evaluated:
 *   d > 0 : value comes from the lda-strided source, imaginary part negated
 *   d < 0 : value comes from the unit-stride source, imaginary part kept
 *   d = 0 : value comes from the unit-stride source, imaginary part set to ZERO
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b)
{
    BLASLONG stride_bytes = sizeof(FLOAT) * lda * 2;
    BLASLONG remaining, line, diag;
    FLOAT *strided_src, *unit_src;

    FLOAT_V_T strided_re, strided_im, unit_re, unit_im;
    FLOAT_V_T out_re, out_im, neg_im, zeros;
    VBOOL_T is_pos, is_zero;
    INT_V_T lane_ids, lane_diag;

    size_t vl = VSETVL_MAX;

    lane_ids = VID_V_INT(vl);            /* 0, 1, 2, ... */
    zeros    = VFMVVF_FLOAT(ZERO, vl);

    remaining = n;
    while (remaining > 0) {
        vl = VSETVL(remaining);

        diag        = posX - posY;
        strided_src = a + posY * 2 + posX * lda * 2;
        unit_src    = a + posX * 2 + posY * lda * 2;

        for (line = 0; line < m; line++) {
            VLSSEG2_FLOAT(&strided_re, &strided_im, strided_src, stride_bytes, vl);
            VLSEG2_FLOAT(&unit_re, &unit_im, unit_src, vl);

            /* Per-lane distance from the diagonal and its sign. */
            lane_diag = VADD_VX_INT(lane_ids, diag, vl);
            is_pos    = VMSGT_VX_INT(lane_diag, 0, vl);
            is_zero   = VMSEQ_VX_INT(lane_diag, 0, vl);

            /* The merge picks its last vector operand where the mask is set. */
            out_re = VMERGE_VVM_FLOAT(is_pos, unit_re, strided_re, vl);
            out_im = VMERGE_VVM_FLOAT(is_pos, unit_im, strided_im, vl);

            neg_im = VFRSUB_VF_FLOAT(out_im, ZERO, vl);              /* ZERO - im */
            out_im = VMERGE_VVM_FLOAT(is_pos, out_im, neg_im, vl);
            out_im = VMERGE_VVM_FLOAT(is_zero, out_im, zeros, vl);

            VSSEG2_FLOAT(b, out_re, out_im, vl);

            b           += vl * 2;
            strided_src += 2;
            unit_src    += lda * 2;
            diag--;
        }

        remaining -= vl;
        posX      += vl;
    }

    return 0;
}

View File

@ -0,0 +1,106 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Precision-dependent aliases for the RVV intrinsics used by the routine
 * below: 32-bit element variants when DOUBLE is not defined, 64-bit element
 * variants otherwise.  Both sets use the "m2" register grouping; the integer
 * vector and mask types are the ones matching that element width.
 */
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m2(n)
#define VSETVL_MAX vsetvlmax_e32m2()
#define FLOAT_V_T vfloat32m2_t
#define VLEV_FLOAT vle32_v_f32m2
#define VSEV_FLOAT vse32_v_f32m2
#define VLSEV_FLOAT vlse32_v_f32m2
#define VLSEG2_FLOAT vlseg2e32_v_f32m2
#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2
#define VSSEG2_FLOAT vsseg2e32_v_f32m2
#define INT_V_T vint32m2_t
#define VID_V_INT vid_v_i32m2
#define VADD_VX_INT vadd_vx_i32m2
#define VMSGT_VX_INT vmsgt_vx_i32m2_b16
#define VBOOL_T vbool16_t
#define VMERGE_VVM_FLOAT vmerge_vvm_f32m2
#else
#define VSETVL(n) vsetvl_e64m2(n)
#define VSETVL_MAX vsetvlmax_e64m2()
#define FLOAT_V_T vfloat64m2_t
#define VLEV_FLOAT vle64_v_f64m2
#define VSEV_FLOAT vse64_v_f64m2
#define VLSEV_FLOAT vlse64_v_f64m2
#define VLSEG2_FLOAT vlseg2e64_v_f64m2
#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2
#define VSSEG2_FLOAT vsseg2e64_v_f64m2
#define INT_V_T vint64m2_t
#define VID_V_INT vid_v_i64m2
#define VADD_VX_INT vadd_vx_i64m2
#define VMSGT_VX_INT vmsgt_vx_i64m2_b32
#define VBOOL_T vbool32_t
#define VMERGE_VVM_FLOAT vmerge_vvm_f64m2
#endif
/*
 * Pack part of the interleaved-complex matrix `a` into the contiguous
 * buffer `b`, one vector of up to vl complex values at a time.
 *
 * The n dimension is walked in strips of vl = VSETVL(remaining) lanes; posX
 * advances by vl after every strip.  For each of the m output vectors of a
 * strip two candidate sources are read:
 *   - a unit-stride segment load starting at a + posX*2 + posY*lda*2, which
 *     moves on by lda complex elements per output vector, and
 *   - a segment load strided by lda complex elements starting at
 *     a + posY*2 + posX*lda*2, which moves on by one complex element.
 * A mask computed as (lane index + posX - posY - vectors already emitted) > 0
 * is handed to the merge intrinsic to choose, lane by lane, between them.
 *
 * b receives 2*vl FLOATs (re/im interleaved) per output vector.
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b)
{
    BLASLONG remaining, vectors_left, diag_off;
    FLOAT *unit_src, *strided_src;
    BLASLONG byte_stride = sizeof(FLOAT)*lda*2;
    FLOAT_V_T out_re, out_im, unit_re, unit_im, strided_re, strided_im;
    VBOOL_T pick;
    INT_V_T lane_ids, shifted_ids;
    size_t vl = VSETVL_MAX;

    /* Lane numbers 0, 1, 2, ... generated once at the maximum vector length. */
    lane_ids = VID_V_INT(vl);

    remaining = n;
    while (remaining > 0) {
        vl = VSETVL(remaining);
        diag_off = posX - posY;
        unit_src = a + posX * 2 + posY * lda * 2;
        strided_src = a + posY * 2 + (posX) * lda * 2;

        vectors_left = m;
        while (vectors_left > 0) {
            /* Both candidates are always loaded; the mask decides afterwards. */
            VLSSEG2_FLOAT(&strided_re, &strided_im, strided_src, byte_stride, vl);
            VLSEG2_FLOAT(&unit_re, &unit_im, unit_src, vl);

            shifted_ids = VADD_VX_INT(lane_ids, diag_off, vl);
            pick = VMSGT_VX_INT(shifted_ids, 0, vl);

            out_re = VMERGE_VVM_FLOAT(pick, strided_re, unit_re, vl);
            out_im = VMERGE_VVM_FLOAT(pick, strided_im, unit_im, vl);
            VSSEG2_FLOAT(b, out_re, out_im, vl);

            b += vl * 2;
            unit_src += lda * 2;
            strided_src += 2;

            vectors_left--;
            diag_off--;
        }

        remaining -= vl;
        posX += vl;
    }

    return 0;
}

View File

@ -0,0 +1,106 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Precision-dependent aliases for the RVV vector types and intrinsics used by
 * CNAME below.  Without DOUBLE the 32-bit-element variants are selected, with
 * DOUBLE the 64-bit ones; both sets use the "m2" type/intrinsic suffix.
 */
#if !defined(DOUBLE)
/* Single precision: f32 data vectors, i32 index vectors, vbool16_t masks. */
#define VSETVL(n) vsetvl_e32m2(n)
#define VSETVL_MAX vsetvlmax_e32m2()
#define FLOAT_V_T vfloat32m2_t
#define VLEV_FLOAT vle32_v_f32m2
#define VSEV_FLOAT vse32_v_f32m2
#define VLSEV_FLOAT vlse32_v_f32m2
/* Two-field segment loads/stores: used for interleaved (re, im) pairs. */
#define VLSEG2_FLOAT vlseg2e32_v_f32m2
#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2
#define VSSEG2_FLOAT vsseg2e32_v_f32m2
/* Signed lane-index helpers used to build the per-lane selection mask. */
#define INT_V_T vint32m2_t
#define VID_V_INT vid_v_i32m2
#define VADD_VX_INT vadd_vx_i32m2
#define VMSGT_VX_INT vmsgt_vx_i32m2_b16
#define VBOOL_T vbool16_t
#define VMERGE_VVM_FLOAT vmerge_vvm_f32m2
#else
/* Double precision: f64 data vectors, i64 index vectors, vbool32_t masks. */
#define VSETVL(n) vsetvl_e64m2(n)
#define VSETVL_MAX vsetvlmax_e64m2()
#define FLOAT_V_T vfloat64m2_t
#define VLEV_FLOAT vle64_v_f64m2
#define VSEV_FLOAT vse64_v_f64m2
#define VLSEV_FLOAT vlse64_v_f64m2
/* Two-field segment loads/stores: used for interleaved (re, im) pairs. */
#define VLSEG2_FLOAT vlseg2e64_v_f64m2
#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2
#define VSSEG2_FLOAT vsseg2e64_v_f64m2
/* Signed lane-index helpers used to build the per-lane selection mask. */
#define INT_V_T vint64m2_t
#define VID_V_INT vid_v_i64m2
#define VADD_VX_INT vadd_vx_i64m2
#define VMSGT_VX_INT vmsgt_vx_i64m2_b32
#define VBOOL_T vbool32_t
#define VMERGE_VVM_FLOAT vmerge_vvm_f64m2
#endif
/*
 * Pack part of the interleaved-complex matrix `a` into the contiguous
 * buffer `b`, one vector of up to vl complex values at a time.
 *
 * The n dimension is walked in strips of vl = VSETVL(remaining) lanes; posX
 * advances by vl after every strip.  For each of the m output vectors of a
 * strip two candidate sources are read:
 *   - a segment load strided by lda complex elements starting at
 *     a + posY*2 + posX*lda*2, which moves on by one complex element per
 *     output vector, and
 *   - a unit-stride segment load starting at a + posX*2 + posY*lda*2, which
 *     moves on by lda complex elements.
 * A mask computed as (lane index + posX - posY - vectors already emitted) > 0
 * is handed to the merge intrinsic to choose, lane by lane, between them.
 *
 * b receives 2*vl FLOATs (re/im interleaved) per output vector.
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b)
{
    BLASLONG remaining, vectors_left, diag_off;
    FLOAT *strided_src, *unit_src;
    BLASLONG byte_stride = sizeof(FLOAT)*lda * 2;
    FLOAT_V_T out_re, out_im, strided_re, strided_im, unit_re, unit_im;
    VBOOL_T pick;
    INT_V_T lane_ids, shifted_ids;
    size_t vl = VSETVL_MAX;

    /* Lane numbers 0, 1, 2, ... generated once at the maximum vector length. */
    lane_ids = VID_V_INT(vl);

    remaining = n;
    while (remaining > 0) {
        vl = VSETVL(remaining);
        diag_off = posX - posY;
        strided_src = a + posY * 2 + (posX + 0) * lda * 2;
        unit_src = a + posX * 2 + 0 + posY * lda * 2;

        vectors_left = m;
        while (vectors_left > 0) {
            /* Both candidates are always loaded; the mask decides afterwards. */
            VLSSEG2_FLOAT(&strided_re, &strided_im, strided_src, byte_stride, vl);
            VLSEG2_FLOAT(&unit_re, &unit_im, unit_src, vl);

            shifted_ids = VADD_VX_INT(lane_ids, diag_off, vl);
            pick = VMSGT_VX_INT(shifted_ids, 0, vl);

            out_re = VMERGE_VVM_FLOAT(pick, unit_re, strided_re, vl);
            out_im = VMERGE_VVM_FLOAT(pick, unit_im, strided_im, vl);
            VSSEG2_FLOAT(b, out_re, out_im, vl);

            b += vl * 2;
            strided_src += 2;
            unit_src += lda * 2;

            vectors_left--;
            diag_off--;
        }

        remaining -= vl;
        posX += vl;
    }

    return 0;
}

View File

@ -0,0 +1,145 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include "common.h"
/*
 * Precision-dependent aliases for the RVV vector types and intrinsics used by
 * CNAME below.  Without DOUBLE the 32-bit-element variants are selected, with
 * DOUBLE the 64-bit ones; both sets use the "m2" type/intrinsic suffix.
 *
 * The lane-index vector and its comparisons are unsigned in both precisions,
 * so the *_UINT / VMSGTU names describe what they expand to.
 */
#if !defined(DOUBLE)
/* Single precision: f32 data vectors, u32 index vectors, vbool16_t masks. */
#define VSETVL(n) vsetvl_e32m2(n)
#define FLOAT_V_T vfloat32m2_t
#define VLEV_FLOAT vle32_v_f32m2
#define VSEV_FLOAT vse32_v_f32m2
#define VLSEV_FLOAT vlse32_v_f32m2
/* Two-field segment loads/stores: used for interleaved (re, im) pairs. */
#define VLSEG2_FLOAT vlseg2e32_v_f32m2
#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2
#define VSSEG2_FLOAT vsseg2e32_v_f32m2
#define VBOOL_T vbool16_t
/* Unsigned lane-index helpers used to mask the diagonal block. */
#define UINT_V_T vuint32m2_t
#define VID_V_UINT vid_v_u32m2
#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16
#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16
#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2
#else
/* Double precision: f64 data vectors, u64 index vectors, vbool32_t masks. */
#define VSETVL(n) vsetvl_e64m2(n)
#define FLOAT_V_T vfloat64m2_t
#define VLEV_FLOAT vle64_v_f64m2
#define VSEV_FLOAT vse64_v_f64m2
#define VLSEV_FLOAT vlse64_v_f64m2
/* Two-field segment loads/stores: used for interleaved (re, im) pairs. */
#define VLSEG2_FLOAT vlseg2e64_v_f64m2
#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2
#define VSSEG2_FLOAT vsseg2e64_v_f64m2
#define VBOOL_T vbool32_t
/* Unsigned lane-index helpers used to mask the diagonal block. */
#define UINT_V_T vuint64m2_t
#define VID_V_UINT vid_v_u64m2
#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32
#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32
#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2
#endif
/*
 * Pack a triangular panel of the interleaved-complex matrix `a` into `b`.
 *
 * The n dimension is processed in strips of vl = VSETVL(remaining) lanes and
 * posY advances by vl after each strip.  Inside a strip X starts at posX and
 * i counts progress towards m:
 *   - X > posY:  one vector is read with a segment load strided by lda
 *                complex elements and stored to b; the source pointer moves
 *                on by one complex element.
 *   - X < posY:  nothing is loaded or stored; the source pointer moves on by
 *                lda complex elements and b skips one vector (2*vl FLOATs).
 *   - X == posY: vl vectors are emitted.  In the j-th one, lanes whose index
 *                is greater than j are replaced by ZERO; when UNIT is defined
 *                lane j is additionally forced to ONE + ZERO*i.  X and i then
 *                advance by vl.
 *
 * NOTE: the X == posY branch always emits vl vectors, independent of how
 * much of m is left.
 *
 * The loop over m is pre-tested, so a call with m <= 0 performs no loads
 * from `a` and no stores to `b`.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
    BLASLONG i, js, X;
    FLOAT *ao;
    BLASLONG stride_lda = sizeof(FLOAT)*lda*2;
    FLOAT_V_T va0, va1;
    size_t vl;
#ifdef UNIT
    VBOOL_T vbool_eq;
#endif
    VBOOL_T vbool_cmp;
    UINT_V_T vindex;

    for (js = n; js > 0; js -= vl)
    {
        vl = VSETVL(js);
        X = posX;

        if (posX <= posY)
        {
            ao = a + posY * 2 + posX * lda * 2;
        }
        else
        {
            ao = a + posX * 2 + posY * lda * 2;
        }

        i = 0;
        /* Pre-tested on purpose: with m <= 0 nothing may be read or written. */
        while (i < m)
        {
            if (X > posY)
            {
                /* Plain copy of one strided vector. */
                VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl);
                VSSEG2_FLOAT(b, va0, va1, vl);
                ao += 2;
                b += vl * 2;
                X ++;
                i ++;
            }
            else if (X < posY)
            {
                /* Skipped position: advance pointers only. */
                ao += lda * 2;
                b += vl * 2;
                X ++;
                i ++;
            }
            else
            {
                /* Diagonal block: vl vectors, masked lane by lane. */
                vindex = VID_V_UINT(vl);
                for (unsigned int j = 0; j < vl; j++)
                {
                    VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl);
                    /* Lanes with index > j are cleared to ZERO. */
                    vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl);
                    va0 = VFMERGE_VFM_FLOAT(vbool_cmp, va0, ZERO, vl);
                    va1 = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl);
#ifdef UNIT
                    /* Lane j is overwritten with ONE + ZERO*i. */
                    vbool_eq = VMSEQ_VX_UINT(vindex, j, vl);
                    va0 = VFMERGE_VFM_FLOAT(vbool_eq, va0, ONE, vl);
                    va1 = VFMERGE_VFM_FLOAT(vbool_eq, va1, ZERO, vl);
#endif
                    VSSEG2_FLOAT(b, va0, va1, vl);
                    ao += 2;
                    b += vl * 2;
                }
                X += vl;
                i += vl;
            }
        }

        posY += vl;
    }

    return 0;
}

View File

@ -0,0 +1,143 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include "common.h"
/*
 * Precision-dependent aliases for the RVV vector types and intrinsics used by
 * CNAME below.  Without DOUBLE the 32-bit-element variants are selected, with
 * DOUBLE the 64-bit ones; both sets use the "m2" type/intrinsic suffix.
 */
#if !defined(DOUBLE)
/* Single precision: f32 data vectors, u32 index vectors, vbool16_t masks. */
#define VSETVL(n) vsetvl_e32m2(n)
#define FLOAT_V_T vfloat32m2_t
#define VLEV_FLOAT vle32_v_f32m2
#define VSEV_FLOAT vse32_v_f32m2
/* Two-field segment loads/stores: used for interleaved (re, im) pairs. */
#define VLSEG2_FLOAT vlseg2e32_v_f32m2
#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2
#define VSSEG2_FLOAT vsseg2e32_v_f32m2
#define VBOOL_T vbool16_t
/* Unsigned lane-index helpers used to mask the diagonal block. */
#define UINT_V_T vuint32m2_t
#define VID_V_UINT vid_v_u32m2
#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16
#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16
#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2
#else
/* Double precision: f64 data vectors, u64 index vectors, vbool32_t masks. */
#define VSETVL(n) vsetvl_e64m2(n)
#define FLOAT_V_T vfloat64m2_t
#define VLEV_FLOAT vle64_v_f64m2
#define VSEV_FLOAT vse64_v_f64m2
/* Two-field segment loads/stores: used for interleaved (re, im) pairs. */
#define VLSEG2_FLOAT vlseg2e64_v_f64m2
#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2
#define VSSEG2_FLOAT vsseg2e64_v_f64m2
#define VBOOL_T vbool32_t
/* Unsigned lane-index helpers used to mask the diagonal block. */
#define UINT_V_T vuint64m2_t
#define VID_V_UINT vid_v_u64m2
#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32
#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32
#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2
#endif
/*
 * Pack a triangular panel of the interleaved-complex matrix `a` into `b`.
 *
 * The n dimension is processed in strips of vl = VSETVL(remaining) lanes and
 * posY advances by vl after each strip.  Inside a strip X starts at posX and
 * i counts progress towards m:
 *   - X > posY:  nothing is loaded or stored; the source pointer moves on by
 *                one complex element and b skips one vector (2*vl FLOATs).
 *   - X < posY:  one vector is read with a unit-stride segment load and
 *                stored to b; the source pointer moves on by lda complex
 *                elements.
 *   - X == posY: vl vectors are emitted.  In the j-th one, lanes whose index
 *                is less than j are replaced by ZERO; when UNIT is defined
 *                lane j is additionally forced to ONE + ZERO*i.  X and i then
 *                advance by vl.
 *
 * NOTE: the X == posY branch always emits vl vectors, independent of how
 * much of m is left.
 *
 * The loop over m is pre-tested, so a call with m <= 0 performs no loads
 * from `a` and no stores to `b`.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
    BLASLONG i, js, X;
    FLOAT *ao;
    FLOAT_V_T va0, va1;
    size_t vl;
#ifdef UNIT
    VBOOL_T vbool_eq;
#endif
    VBOOL_T vbool_cmp;
    UINT_V_T vindex;

    for (js = n; js > 0; js -= vl)
    {
        vl = VSETVL(js);
        X = posX;

        if (posX <= posY)
        {
            ao = a + posY * 2 + posX * lda * 2;
        }
        else
        {
            ao = a + posX * 2 + posY * lda * 2;
        }

        i = 0;
        /* Pre-tested on purpose: with m <= 0 nothing may be read or written. */
        while (i < m)
        {
            if (X > posY)
            {
                /* Skipped position: advance pointers only. */
                ao += 2;
                b += vl * 2;
                X++;
                i++;
            }
            else if (X < posY)
            {
                /* Plain copy of one unit-stride vector. */
                VLSEG2_FLOAT(&va0, &va1, ao, vl);
                VSSEG2_FLOAT(b, va0, va1, vl);
                ao += lda * 2;
                b += vl * 2;
                X ++;
                i ++;
            }
            else
            {
                /* Diagonal block: vl vectors, masked lane by lane. */
                vindex = VID_V_UINT(vl);
                for (unsigned int j = 0; j < vl; j++)
                {
                    VLSEG2_FLOAT(&va0, &va1, ao, vl);
                    /* Lanes with index < j are cleared to ZERO. */
                    vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl);
                    va0 = VFMERGE_VFM_FLOAT(vbool_cmp, va0, ZERO, vl);
                    va1 = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl);
#ifdef UNIT
                    /* Lane j is overwritten with ONE + ZERO*i. */
                    vbool_eq = VMSEQ_VX_UINT(vindex, j, vl);
                    va0 = VFMERGE_VFM_FLOAT(vbool_eq, va0, ONE, vl);
                    va1 = VFMERGE_VFM_FLOAT(vbool_eq, va1, ZERO, vl);
#endif
                    VSSEG2_FLOAT(b, va0, va1, vl);
                    ao += lda * 2;
                    b += vl * 2;
                }
                X += vl;
                i += vl;
            }
        }

        posY += vl;
    }

    return 0;
}

View File

@ -0,0 +1,144 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include "common.h"
/*
 * Precision-dependent aliases for the RVV vector types and intrinsics used by
 * CNAME below.  Without DOUBLE the 32-bit-element variants are selected, with
 * DOUBLE the 64-bit ones; both sets use the "m2" type/intrinsic suffix.
 */
#if !defined(DOUBLE)
/* Single precision: f32 data vectors, u32 index vectors, vbool16_t masks. */
#define VSETVL(n) vsetvl_e32m2(n)
#define FLOAT_V_T vfloat32m2_t
#define VLEV_FLOAT vle32_v_f32m2
#define VLSEV_FLOAT vlse32_v_f32m2
#define VSEV_FLOAT vse32_v_f32m2
/* Two-field segment loads/stores: used for interleaved (re, im) pairs. */
#define VLSEG2_FLOAT vlseg2e32_v_f32m2
#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2
#define VSSEG2_FLOAT vsseg2e32_v_f32m2
#define VBOOL_T vbool16_t
/* Unsigned lane-index helpers used to mask the diagonal block. */
#define UINT_V_T vuint32m2_t
#define VID_V_UINT vid_v_u32m2
#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16
#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16
#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2
#else
/* Double precision: f64 data vectors, u64 index vectors, vbool32_t masks. */
#define VSETVL(n) vsetvl_e64m2(n)
#define FLOAT_V_T vfloat64m2_t
#define VLEV_FLOAT vle64_v_f64m2
#define VLSEV_FLOAT vlse64_v_f64m2
#define VSEV_FLOAT vse64_v_f64m2
/* Two-field segment loads/stores: used for interleaved (re, im) pairs. */
#define VLSEG2_FLOAT vlseg2e64_v_f64m2
#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2
#define VSSEG2_FLOAT vsseg2e64_v_f64m2
#define VBOOL_T vbool32_t
/* Unsigned lane-index helpers used to mask the diagonal block. */
#define UINT_V_T vuint64m2_t
#define VID_V_UINT vid_v_u64m2
#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32
#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32
#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2
#endif
/*
 * Pack a triangular panel of the interleaved-complex matrix `a` into `b`.
 *
 * The n dimension is processed in strips of vl = VSETVL(remaining) lanes and
 * posY advances by vl after each strip.  Inside a strip X starts at posX and
 * i counts progress towards m:
 *   - X < posY:  one vector is read with a segment load strided by lda
 *                complex elements and stored to b; the source pointer moves
 *                on by one complex element.
 *   - X > posY:  nothing is loaded or stored; the source pointer moves on by
 *                lda complex elements and b skips one vector (2*vl FLOATs).
 *   - X == posY: vl vectors are emitted.  In the j-th one, lanes whose index
 *                is less than j are replaced by ZERO; when UNIT is defined
 *                lane j is additionally forced to ONE + ZERO*i.  X and i then
 *                advance by vl.
 *
 * NOTE: the X == posY branch always emits vl vectors, independent of how
 * much of m is left.
 *
 * The loop over m is pre-tested, so a call with m <= 0 performs no loads
 * from `a` and no stores to `b`.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
    BLASLONG i, js, X;
    BLASLONG stride_lda = sizeof(FLOAT) * lda * 2;
    FLOAT *ao;
    FLOAT_V_T va0, va1;
    size_t vl;
#ifdef UNIT
    VBOOL_T vbool_eq;
#endif
    VBOOL_T vbool_cmp;
    UINT_V_T vindex;

    for (js = n; js > 0; js -= vl)
    {
        vl = VSETVL(js);
        X = posX;

        if (posX <= posY)
        {
            ao = a + posX * 2 + posY * lda * 2;
        }
        else
        {
            ao = a + posY * 2 + posX * lda * 2;
        }

        i = 0;
        /* Pre-tested on purpose: with m <= 0 nothing may be read or written. */
        while (i < m)
        {
            if (X < posY)
            {
                /* Plain copy of one strided vector. */
                VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl);
                VSSEG2_FLOAT(b, va0, va1, vl);
                ao += 2;
                b += vl * 2;
                X++;
                i++;
            }
            else if (X > posY)
            {
                /* Skipped position: advance pointers only. */
                ao += lda * 2;
                b += vl * 2;
                X++;
                i++;
            }
            else
            {
                /* Diagonal block: vl vectors, masked lane by lane. */
                vindex = VID_V_UINT(vl);
                for (unsigned int j = 0; j < vl; j++)
                {
                    VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl);
                    /* Lanes with index < j are cleared to ZERO. */
                    vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl);
                    va0 = VFMERGE_VFM_FLOAT(vbool_cmp, va0, ZERO, vl);
                    va1 = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl);
#ifdef UNIT
                    /* Lane j is overwritten with ONE + ZERO*i. */
                    vbool_eq = VMSEQ_VX_UINT(vindex, j, vl);
                    va0 = VFMERGE_VFM_FLOAT(vbool_eq, va0, ONE, vl);
                    va1 = VFMERGE_VFM_FLOAT(vbool_eq, va1, ZERO, vl);
#endif
                    VSSEG2_FLOAT(b, va0, va1, vl);
                    ao += 2;
                    b += vl * 2;
                }
                X += vl;
                i += vl;
            }
        }

        posY += vl;
    }

    return 0;
}

View File

@ -0,0 +1,140 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include "common.h"
/*
 * Precision-dependent aliases for the RVV vector types and intrinsics used by
 * CNAME below.  Without DOUBLE the 32-bit-element variants are selected, with
 * DOUBLE the 64-bit ones; both sets use the "m2" type/intrinsic suffix.
 */
#if !defined(DOUBLE)
/* Single precision: f32 data vectors, u32 index vectors, vbool16_t masks. */
#define VSETVL(n) vsetvl_e32m2(n)
#define FLOAT_V_T vfloat32m2_t
#define VLEV_FLOAT vle32_v_f32m2
#define VSEV_FLOAT vse32_v_f32m2
/* Two-field segment loads/stores: used for interleaved (re, im) pairs. */
#define VLSEG2_FLOAT vlseg2e32_v_f32m2
#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2
#define VSSEG2_FLOAT vsseg2e32_v_f32m2
#define VBOOL_T vbool16_t
/* Unsigned lane-index helpers used to mask the diagonal block. */
#define UINT_V_T vuint32m2_t
#define VID_V_UINT vid_v_u32m2
#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16
#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16
#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2
#else
/* Double precision: f64 data vectors, u64 index vectors, vbool32_t masks. */
#define VSETVL(n) vsetvl_e64m2(n)
#define FLOAT_V_T vfloat64m2_t
#define VLEV_FLOAT vle64_v_f64m2
#define VSEV_FLOAT vse64_v_f64m2
/* Two-field segment loads/stores: used for interleaved (re, im) pairs. */
#define VLSEG2_FLOAT vlseg2e64_v_f64m2
#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2
#define VSSEG2_FLOAT vsseg2e64_v_f64m2
#define VBOOL_T vbool32_t
/* Unsigned lane-index helpers used to mask the diagonal block. */
#define UINT_V_T vuint64m2_t
#define VID_V_UINT vid_v_u64m2
#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32
#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32
#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2
#endif
/*
 * Pack a triangular panel of the interleaved-complex matrix `a` into `b`.
 *
 * The n dimension is processed in strips of vl = VSETVL(remaining) lanes and
 * posY advances by vl after each strip.  Inside a strip X starts at posX and
 * i counts progress towards m:
 *   - X < posY:  nothing is loaded or stored; the source pointer moves on by
 *                one complex element and b skips one vector (2*vl FLOATs).
 *   - X > posY:  one vector is read with a unit-stride segment load and
 *                stored to b; the source pointer moves on by lda complex
 *                elements.
 *   - X == posY: vl vectors are emitted.  In the j-th one, lanes whose index
 *                is greater than j are replaced by ZERO; when UNIT is defined
 *                lane j is additionally forced to ONE + ZERO*i.  X and i then
 *                advance by vl.
 *
 * NOTE: the X == posY branch always emits vl vectors, independent of how
 * much of m is left.
 *
 * The loop over m is pre-tested, so a call with m <= 0 performs no loads
 * from `a` and no stores to `b`.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
    BLASLONG i, j, js, X;
    FLOAT *ao;
    FLOAT_V_T va0, va1;
#ifdef UNIT
    VBOOL_T vbool_eq;
#endif
    VBOOL_T vbool_cmp;
    UINT_V_T vindex;
    size_t vl;

    for (js = n; js > 0; js -= vl)
    {
        vl = VSETVL(js);
        X = posX;

        if (posX <= posY)
        {
            ao = a + posX * 2 + posY * lda * 2;
        }
        else
        {
            ao = a + posY * 2 + posX * lda * 2;
        }

        i = 0;
        /* Pre-tested on purpose: with m <= 0 nothing may be read or written. */
        while (i < m)
        {
            if (X < posY)
            {
                /* Skipped position: advance pointers only. */
                ao += 2;
                b += vl * 2;
                X++;
                i++;
            }
            else if (X > posY)
            {
                /* Plain copy of one unit-stride vector. */
                VLSEG2_FLOAT(&va0, &va1, ao, vl);
                VSSEG2_FLOAT(b, va0, va1, vl);
                ao += lda * 2;
                b += vl * 2;
                X++;
                i++;
            }
            else
            {
                /* Diagonal block: vl vectors, masked lane by lane. */
                vindex = VID_V_UINT(vl);
                for (j = 0; j < vl; j++)
                {
                    VLSEG2_FLOAT(&va0, &va1, ao, vl);
                    /* Lanes with index > j are cleared to ZERO. */
                    vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl);
                    va0 = VFMERGE_VFM_FLOAT(vbool_cmp, va0, ZERO, vl);
                    va1 = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl);
#ifdef UNIT
                    /* Lane j is overwritten with ONE + ZERO*i. */
                    vbool_eq = VMSEQ_VX_UINT(vindex, j, vl);
                    va0 = VFMERGE_VFM_FLOAT(vbool_eq, va0, ONE, vl);
                    va1 = VFMERGE_VFM_FLOAT(vbool_eq, va1, ZERO, vl);
#endif
                    VSSEG2_FLOAT(b, va0, va1, vl);
                    ao += lda * 2;
                    b += vl * 2;
                }
                X += vl;
                i += vl;
            }
        }

        posY += vl;
    }

    return 0;
}

View File

@ -0,0 +1,574 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Precision-dependent aliases for the RVV vector types and intrinsics used by
 * the kernel below.  Without DOUBLE the 32-bit-element variants are selected,
 * with DOUBLE the 64-bit ones; both sets use the "m2" type/intrinsic suffix.
 */
#if !defined(DOUBLE)
/* Single precision. */
#define VSETVL(n) vsetvl_e32m2(n)
#define FLOAT_V_T vfloat32m2_t
#define VLEV_FLOAT vle32_v_f32m2
#define VSEV_FLOAT vse32_v_f32m2
/* Two-field segment load/store: used for interleaved (re, im) pairs. */
#define VLSEG2_FLOAT vlseg2e32_v_f32m2
#define VSSEG2_FLOAT vsseg2e32_v_f32m2
/* Scalar broadcast, vector*scalar multiply and the two accumulate forms. */
#define VFMVVF_FLOAT vfmv_v_f_f32m2
#define VFMACCVF_FLOAT vfmacc_vf_f32m2
#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2
#define VFMULVF_FLOAT vfmul_vf_f32m2
#else
/* Double precision. */
#define VSETVL(n) vsetvl_e64m2(n)
#define FLOAT_V_T vfloat64m2_t
#define VLEV_FLOAT vle64_v_f64m2
#define VSEV_FLOAT vse64_v_f64m2
/* Two-field segment load/store: used for interleaved (re, im) pairs. */
#define VLSEG2_FLOAT vlseg2e64_v_f64m2
#define VSSEG2_FLOAT vsseg2e64_v_f64m2
/* Scalar broadcast, vector*scalar multiply and the two accumulate forms. */
#define VFMVVF_FLOAT vfmv_v_f_f64m2
#define VFMACCVF_FLOAT vfmacc_vf_f64m2
#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2
#define VFMULVF_FLOAT vfmul_vf_f64m2
#endif
/*
 * Sign selection for the four partial products of a complex multiply.
 * In the kernel below the operations are applied as
 *   real acc = OP_rr(real acc, b_re, a_re);  real acc = OP_ii(real acc, b_im, a_im);
 *   imag acc = OP_ir(imag acc, b_re, a_im);  imag acc = OP_ri(imag acc, b_im, a_re);
 * VFMACCVF_FLOAT adds the product to the accumulator and VFNMSACVF_FLOAT
 * subtracts it, so each group below fixes the sign of the three non-rr terms.
 * NOTE(review): the two-letter build macros presumably encode the
 * transpose/conjugate mode of the A and B operands - confirm against the
 * build rules.  If none of them is defined, the OP_* macros stay undefined.
 */
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
/* +rr, +ir, -ii, +ri */
#define OP_rr VFMACCVF_FLOAT
#define OP_ir VFMACCVF_FLOAT
#define OP_ii VFNMSACVF_FLOAT
#define OP_ri VFMACCVF_FLOAT
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
/* +rr, +ir, +ii, -ri */
#define OP_rr VFMACCVF_FLOAT
#define OP_ir VFMACCVF_FLOAT
#define OP_ii VFMACCVF_FLOAT
#define OP_ri VFNMSACVF_FLOAT
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
/* +rr, -ir, +ii, +ri */
#define OP_rr VFMACCVF_FLOAT
#define OP_ir VFNMSACVF_FLOAT
#define OP_ii VFMACCVF_FLOAT
#define OP_ri VFMACCVF_FLOAT
#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
/* +rr, -ir, -ii, -ri */
#define OP_rr VFMACCVF_FLOAT
#define OP_ir VFNMSACVF_FLOAT
#define OP_ii VFNMSACVF_FLOAT
#define OP_ri VFNMSACVF_FLOAT
#endif
int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb,FLOAT* C, BLASLONG ldc, BLASLONG offset)
{
BLASLONG i,j,k;
FLOAT *C0, *C1, *C2, *C3, *ptrba,*ptrbb;
BLASLONG off, temp;
#if defined(TRMMKERNEL) && !defined(LEFT)
off = -offset;
#else
off = 0;
#endif
FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7;
FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7;
//fprintf(stderr, "%s, bn=%ld bm=%ld bk=%ld alphar=%f alphai=%f ldc=%ld, offset=%ld\n", __FUNCTION__, bn, bm, bk, alphar, alphai, ldc, offset); // Debug
size_t vl;
for (j = bn/4; j > 0; j--)
{
C0 = C;
C1 = C0 + 2 * ldc;
C2 = C1 + 2 * ldc;
C3 = C2 + 2 * ldc;
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
ptrba = ba;
for (i = bm; i > 0; i -= vl)
{
vl = VSETVL(i);
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*vl*2;
ptrbb = bb + off*4*2;
#endif
vres0 = VFMVVF_FLOAT(0.0, vl);
vres1 = VFMVVF_FLOAT(0.0, vl);
vres2 = VFMVVF_FLOAT(0.0, vl);
vres3 = VFMVVF_FLOAT(0.0, vl);
vres4 = VFMVVF_FLOAT(0.0, vl);
vres5 = VFMVVF_FLOAT(0.0, vl);
vres6 = VFMVVF_FLOAT(0.0, vl);
vres7 = VFMVVF_FLOAT(0.0, vl);
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+vl; // number of values in A
#else
temp = off+4; // number of values in B
#endif
for (k = temp/4; k > 0; k--)
{
VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
ptrba += vl*2;
VLSEG2_FLOAT(&va2, &va3, ptrba, vl);
ptrba += vl*2;
vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl);
vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl);
vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl);
vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl);
vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl);
vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl);
vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl);
vres3 = OP_ri(vres3, *(ptrbb + 3), va0, vl);
vres4 = OP_rr(vres4, *(ptrbb + 4), va0, vl);
vres5 = OP_ir(vres5, *(ptrbb + 4), va1, vl);
vres4 = OP_ii(vres4, *(ptrbb + 5), va1, vl);
vres5 = OP_ri(vres5, *(ptrbb + 5), va0, vl);
vres6 = OP_rr(vres6, *(ptrbb + 6), va0, vl);
vres7 = OP_ir(vres7, *(ptrbb + 6), va1, vl);
vres6 = OP_ii(vres6, *(ptrbb + 7), va1, vl);
vres7 = OP_ri(vres7, *(ptrbb + 7), va0, vl);
ptrbb += 8;
VLSEG2_FLOAT(&va4, &va5, ptrba, vl);
ptrba += vl*2;
vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl);
vres1 = OP_ir(vres1, *(ptrbb + 0), va3, vl);
vres0 = OP_ii(vres0, *(ptrbb + 1), va3, vl);
vres1 = OP_ri(vres1, *(ptrbb + 1), va2, vl);
vres2 = OP_rr(vres2, *(ptrbb + 2), va2, vl);
vres3 = OP_ir(vres3, *(ptrbb + 2), va3, vl);
vres2 = OP_ii(vres2, *(ptrbb + 3), va3, vl);
vres3 = OP_ri(vres3, *(ptrbb + 3), va2, vl);
vres4 = OP_rr(vres4, *(ptrbb + 4), va2, vl);
vres5 = OP_ir(vres5, *(ptrbb + 4), va3, vl);
vres4 = OP_ii(vres4, *(ptrbb + 5), va3, vl);
vres5 = OP_ri(vres5, *(ptrbb + 5), va2, vl);
vres6 = OP_rr(vres6, *(ptrbb + 6), va2, vl);
vres7 = OP_ir(vres7, *(ptrbb + 6), va3, vl);
vres6 = OP_ii(vres6, *(ptrbb + 7), va3, vl);
vres7 = OP_ri(vres7, *(ptrbb + 7), va2, vl);
ptrbb += 8;
VLSEG2_FLOAT(&va6, &va7, ptrba, vl);
ptrba += vl*2;
vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl);
vres1 = OP_ir(vres1, *(ptrbb + 0), va5, vl);
vres0 = OP_ii(vres0, *(ptrbb + 1), va5, vl);
vres1 = OP_ri(vres1, *(ptrbb + 1), va4, vl);
vres2 = OP_rr(vres2, *(ptrbb + 2), va4, vl);
vres3 = OP_ir(vres3, *(ptrbb + 2), va5, vl);
vres2 = OP_ii(vres2, *(ptrbb + 3), va5, vl);
vres3 = OP_ri(vres3, *(ptrbb + 3), va4, vl);
vres4 = OP_rr(vres4, *(ptrbb + 4), va4, vl);
vres5 = OP_ir(vres5, *(ptrbb + 4), va5, vl);
vres4 = OP_ii(vres4, *(ptrbb + 5), va5, vl);
vres5 = OP_ri(vres5, *(ptrbb + 5), va4, vl);
vres6 = OP_rr(vres6, *(ptrbb + 6), va4, vl);
vres7 = OP_ir(vres7, *(ptrbb + 6), va5, vl);
vres6 = OP_ii(vres6, *(ptrbb + 7), va5, vl);
vres7 = OP_ri(vres7, *(ptrbb + 7), va4, vl);
ptrbb += 8;
vres0 = OP_rr(vres0, *(ptrbb + 0), va6, vl);
vres1 = OP_ir(vres1, *(ptrbb + 0), va7, vl);
vres0 = OP_ii(vres0, *(ptrbb + 1), va7, vl);
vres1 = OP_ri(vres1, *(ptrbb + 1), va6, vl);
vres2 = OP_rr(vres2, *(ptrbb + 2), va6, vl);
vres3 = OP_ir(vres3, *(ptrbb + 2), va7, vl);
vres2 = OP_ii(vres2, *(ptrbb + 3), va7, vl);
vres3 = OP_ri(vres3, *(ptrbb + 3), va6, vl);
vres4 = OP_rr(vres4, *(ptrbb + 4), va6, vl);
vres5 = OP_ir(vres5, *(ptrbb + 4), va7, vl);
vres4 = OP_ii(vres4, *(ptrbb + 5), va7, vl);
vres5 = OP_ri(vres5, *(ptrbb + 5), va6, vl);
vres6 = OP_rr(vres6, *(ptrbb + 6), va6, vl);
vres7 = OP_ir(vres7, *(ptrbb + 6), va7, vl);
vres6 = OP_ii(vres6, *(ptrbb + 7), va7, vl);
vres7 = OP_ri(vres7, *(ptrbb + 7), va6, vl);
ptrbb += 8;
}
for (k = temp & 3; k > 0; k--)
{
VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
ptrba += vl*2;
vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl);
vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl);
vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl);
vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl);
vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl);
vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl);
vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl);
vres3 = OP_ri(vres3, *(ptrbb + 3), va0, vl);
vres4 = OP_rr(vres4, *(ptrbb + 4), va0, vl);
vres5 = OP_ir(vres5, *(ptrbb + 4), va1, vl);
vres4 = OP_ii(vres4, *(ptrbb + 5), va1, vl);
vres5 = OP_ri(vres5, *(ptrbb + 5), va0, vl);
vres6 = OP_rr(vres6, *(ptrbb + 6), va0, vl);
vres7 = OP_ir(vres7, *(ptrbb + 6), va1, vl);
vres6 = OP_ii(vres6, *(ptrbb + 7), va1, vl);
vres7 = OP_ri(vres7, *(ptrbb + 7), va0, vl);
ptrbb += 8;
}
va0 = VFMULVF_FLOAT(vres0, alphar, vl);
va1 = VFMULVF_FLOAT(vres1, alphar, vl);
va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl);
va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl);
VSSEG2_FLOAT(C0, va0, va1, vl);
va2 = VFMULVF_FLOAT(vres2, alphar, vl);
va3 = VFMULVF_FLOAT(vres3, alphar, vl);
va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl);
va3 = VFMACCVF_FLOAT(va3, alphai, vres2, vl);
VSSEG2_FLOAT(C1, va2, va3, vl);
va0 = VFMULVF_FLOAT(vres4, alphar, vl);
va1 = VFMULVF_FLOAT(vres5, alphar, vl);
va0 = VFNMSACVF_FLOAT(va0, alphai, vres5, vl);
va1 = VFMACCVF_FLOAT(va1, alphai, vres4, vl);
VSSEG2_FLOAT(C2, va0, va1, vl);
va2 = VFMULVF_FLOAT(vres6, alphar, vl);
va3 = VFMULVF_FLOAT(vres7, alphar, vl);
va2 = VFNMSACVF_FLOAT(va2, alphai, vres7, vl);
va3 = VFMACCVF_FLOAT(va3, alphai, vres6, vl);
VSSEG2_FLOAT(C3, va2, va3, vl);
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= vl; // number of values in A
#else
temp -= 4; // number of values in B
#endif
ptrba += temp*vl*2;
ptrbb += temp*4*2;
#endif
#ifdef LEFT
off += vl; // number of values in A
#endif
C0 += vl * 2;
C1 += vl * 2;
C2 += vl * 2;
C3 += vl * 2;
}
#if defined(TRMMKERNEL) && !defined(LEFT)
off += 4;
#endif
bb += (bk << 3);
C += (ldc << 3);
}
if (bn & 2)
{
C0 = C;
C1 = C0 + 2 * ldc;
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
ptrba = ba;
for (i = bm; i > 0; i -= vl)
{
vl = VSETVL(i);
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*vl*2;
ptrbb = bb + off*2*2;
#endif
vres0 = VFMVVF_FLOAT(0.0, vl);
vres1 = VFMVVF_FLOAT(0.0, vl);
vres2 = VFMVVF_FLOAT(0.0, vl);
vres3 = VFMVVF_FLOAT(0.0, vl);
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+vl; // number of values in A
#else
temp = off+2; // number of values in B
#endif
for (k = temp/4; k > 0; k--)
{
VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
ptrba += vl*2;
VLSEG2_FLOAT(&va2, &va3, ptrba, vl);
ptrba += vl*2;
vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl);
vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl);
vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl);
vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl);
vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl);
vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl);
vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl);
vres3 = OP_ri(vres3, *(ptrbb + 3), va0, vl);
ptrbb += 4;
VLSEG2_FLOAT(&va4, &va5, ptrba, vl);
ptrba += vl*2;
vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl);
vres1 = OP_ir(vres1, *(ptrbb + 0), va3, vl);
vres0 = OP_ii(vres0, *(ptrbb + 1), va3, vl);
vres1 = OP_ri(vres1, *(ptrbb + 1), va2, vl);
vres2 = OP_rr(vres2, *(ptrbb + 2), va2, vl);
vres3 = OP_ir(vres3, *(ptrbb + 2), va3, vl);
vres2 = OP_ii(vres2, *(ptrbb + 3), va3, vl);
vres3 = OP_ri(vres3, *(ptrbb + 3), va2, vl);
ptrbb += 4;
VLSEG2_FLOAT(&va6, &va7, ptrba, vl);
ptrba += vl*2;
vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl);
vres1 = OP_ir(vres1, *(ptrbb + 0), va5, vl);
vres0 = OP_ii(vres0, *(ptrbb + 1), va5, vl);
vres1 = OP_ri(vres1, *(ptrbb + 1), va4, vl);
vres2 = OP_rr(vres2, *(ptrbb + 2), va4, vl);
vres3 = OP_ir(vres3, *(ptrbb + 2), va5, vl);
vres2 = OP_ii(vres2, *(ptrbb + 3), va5, vl);
vres3 = OP_ri(vres3, *(ptrbb + 3), va4, vl);
ptrbb += 4;
vres0 = OP_rr(vres0, *(ptrbb + 0), va6, vl);
vres1 = OP_ir(vres1, *(ptrbb + 0), va7, vl);
vres0 = OP_ii(vres0, *(ptrbb + 1), va7, vl);
vres1 = OP_ri(vres1, *(ptrbb + 1), va6, vl);
vres2 = OP_rr(vres2, *(ptrbb + 2), va6, vl);
vres3 = OP_ir(vres3, *(ptrbb + 2), va7, vl);
vres2 = OP_ii(vres2, *(ptrbb + 3), va7, vl);
vres3 = OP_ri(vres3, *(ptrbb + 3), va6, vl);
ptrbb += 4;
}
for (k = temp & 3; k > 0; k--)
{
VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
ptrba += vl*2;
vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl);
vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl);
vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl);
vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl);
vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl);
vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl);
vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl);
vres3 = OP_ri(vres3, *(ptrbb + 3), va0, vl);
ptrbb += 4;
}
va0 = VFMULVF_FLOAT(vres0, alphar, vl);
va1 = VFMULVF_FLOAT(vres1, alphar, vl);
va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl);
va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl);
VSSEG2_FLOAT(C0, va0, va1, vl);
va2 = VFMULVF_FLOAT(vres2, alphar, vl);
va3 = VFMULVF_FLOAT(vres3, alphar, vl);
va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl);
va3 = VFMACCVF_FLOAT(va3, alphai, vres2, vl);
VSSEG2_FLOAT(C1, va2, va3, vl);
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= vl; // number of values in A
#else
temp -= 2; // number of values in B
#endif
ptrba += temp*vl*2;
ptrbb += temp*2*2;
#endif
#ifdef LEFT
off += vl; // number of values in A
#endif
C0 += vl * 2;
C1 += vl * 2;
}
#if defined(TRMMKERNEL) && !defined(LEFT)
off += 2;
#endif
bb += (bk << 2);
C += (ldc << 2);
}
if (bn & 1)
{
C0 = C;
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
ptrba = ba;
for (i = bm; i > 0; i -= vl)
{
vl = VSETVL(i);
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*vl*2;
ptrbb = bb + off*2;
#endif
vres0 = VFMVVF_FLOAT(0.0, vl);
vres1 = VFMVVF_FLOAT(0.0, vl);
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+vl; // number of values in A
#else
temp = off+1; // number of values in B
#endif
for (k = temp/4; k > 0; k--)
{
VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
ptrba += vl*2;
VLSEG2_FLOAT(&va2, &va3, ptrba, vl);
ptrba += vl*2;
vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl);
vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl);
vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl);
vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl);
ptrbb += 2;
VLSEG2_FLOAT(&va4, &va5, ptrba, vl);
ptrba += vl*2;
vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl);
vres1 = OP_ir(vres1, *(ptrbb + 0), va3, vl);
vres0 = OP_ii(vres0, *(ptrbb + 1), va3, vl);
vres1 = OP_ri(vres1, *(ptrbb + 1), va2, vl);
ptrbb += 2;
VLSEG2_FLOAT(&va6, &va7, ptrba, vl);
ptrba += vl*2;
vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl);
vres1 = OP_ir(vres1, *(ptrbb + 0), va5, vl);
vres0 = OP_ii(vres0, *(ptrbb + 1), va5, vl);
vres1 = OP_ri(vres1, *(ptrbb + 1), va4, vl);
ptrbb += 2;
vres0 = OP_rr(vres0, *(ptrbb + 0), va6, vl);
vres1 = OP_ir(vres1, *(ptrbb + 0), va7, vl);
vres0 = OP_ii(vres0, *(ptrbb + 1), va7, vl);
vres1 = OP_ri(vres1, *(ptrbb + 1), va6, vl);
ptrbb += 2;
}
for (k = temp & 3; k > 0; k--)
{
VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
ptrba += vl*2;
vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl);
vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl);
vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl);
vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl);
ptrbb += 2;
}
va0 = VFMULVF_FLOAT(vres0, alphar, vl);
va1 = VFMULVF_FLOAT(vres1, alphar, vl);
va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl);
va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl);
VSSEG2_FLOAT(C0, va0, va1, vl);
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= vl; // number of values in A
#else
temp -= 1; // number of values in B
#endif
ptrba += temp*vl*2;
ptrbb += temp*2;
#endif
#ifdef LEFT
off += vl; // number of values in A
#endif
C0 += vl * 2;
}
#if defined(TRMMKERNEL) && !defined(LEFT)
off += 1;
#endif
bb += bk << 1;
C += ldc << 1;
}
return 0;
}

View File

@ -0,0 +1,115 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include "common.h"
/*
 * Precision-dependent aliases for the RVV intrinsics used by the routine below.
 * Without DOUBLE the 32-bit element (e32) variants are selected, with DOUBLE the
 * 64-bit (e64) ones.  Both branches use the m2 register grouping, so the mask
 * type is vbool16_t for e32 and vbool32_t for e64.
 *
 *   VLSSEG2_FLOAT   - strided two-field segment load
 *   VSSEG2_FLOAT    - unit-stride two-field segment store
 *   VSSEG2_FLOAT_M  - masked form of the segment store
 *   VID_V_UINT      - vector of element indices
 *   VMSLTU_VX_UINT  - mask of elements unsigned-less-than a scalar
 */
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m2(n)
#define FLOAT_V_T vfloat32m2_t
#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2
#define VSSEG2_FLOAT vsseg2e32_v_f32m2
#define VSSEG2_FLOAT_M vsseg2e32_v_f32m2_m
#define VBOOL_T vbool16_t
#define UINT_V_T vuint32m2_t
#define VID_V_UINT vid_v_u32m2
#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16
#else
#define VSETVL(n) vsetvl_e64m2(n)
#define FLOAT_V_T vfloat64m2_t
#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2
#define VSSEG2_FLOAT vsseg2e64_v_f64m2
#define VSSEG2_FLOAT_M vsseg2e64_v_f64m2_m
#define VBOOL_T vbool32_t
#define UINT_V_T vuint64m2_t
#define VID_V_UINT vid_v_u64m2
#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32
#endif
/*
 * Packs data from `a` into the contiguous buffer `b`, processing `n` in
 * chunks of `vl` (as returned by VSETVL) and walking positions 0..m-1
 * inside every chunk.
 *
 *   m, n    - extents of the two loops (inner and outer respectively)
 *   a       - source; interleaved pairs of FLOAT (two FLOATs per element)
 *   lda     - element distance used for the strided loads (in pairs of FLOAT)
 *   offset  - position at which the first chunk meets the diagonal
 *   b       - destination; advanced by vl pairs for every position visited
 *
 * For each chunk:
 *   - positions past the diagonal are gathered (vl elements, lda apart)
 *     and stored to b in full;
 *   - positions before the diagonal are not written, but b is still advanced;
 *   - at the diagonal, vl consecutive positions are handled together: for
 *     step d only lanes with index < d are stored, and compinv() fills lane d
 *     from the diagonal element (compinv is defined outside this file chunk;
 *     presumably it stores the complex reciprocal -- confirm).
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
    /* byte stride between the elements gathered by one strided load */
    const BLASLONG gather_stride = sizeof(FLOAT)*lda*2;
    BLASLONG diag_pos = offset;
    BLASLONG remaining = n;
    size_t vl;

    while (remaining > 0)
    {
        vl = VSETVL(remaining);
        FLOAT *src = a;
        BLASLONG pos = 0;

        while (pos < m)
        {
            FLOAT_V_T re, im;

            if (pos != diag_pos)
            {
                /* off-diagonal position: copy only when past the diagonal */
                if (pos > diag_pos)
                {
                    VLSSEG2_FLOAT(&re, &im, src, gather_stride, vl);
                    VSSEG2_FLOAT(b, re, im, vl);
                }
                src += 2;
                b += vl * 2;
                pos++;
                continue;
            }

            /* diagonal: handle vl positions in one go */
            UINT_V_T lane_id = VID_V_UINT(vl);
            for (unsigned int d = 0; d < vl; d++)
            {
                VBOOL_T lanes_before_d = VMSLTU_VX_UINT(lane_id, d, vl);
                VLSSEG2_FLOAT(&re, &im, src, gather_stride, vl);
                VSSEG2_FLOAT_M(lanes_before_d, b, re, im, vl);
                compinv((b + d * 2), *(src + d * lda * 2), *(src + d * lda * 2 + 1));
                src += 2;
                b += vl * 2;
            }
            pos += vl;
        }

        a += vl * lda * 2;
        diag_pos += vl;
        remaining -= vl;
    }
    return 0;
}

View File

@ -0,0 +1,114 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include "common.h"
/*
 * Precision-dependent aliases for the RVV intrinsics used by the routine below.
 * Without DOUBLE the 32-bit element (e32) variants are selected, with DOUBLE the
 * 64-bit (e64) ones.  Both branches use the m2 register grouping, so the mask
 * type is vbool16_t for e32 and vbool32_t for e64.
 *
 *   VLSEG2_FLOAT    - unit-stride two-field segment load
 *   VSSEG2_FLOAT    - unit-stride two-field segment store
 *   VSSEG2_FLOAT_M  - masked form of the segment store
 *   VID_V_UINT      - vector of element indices
 *   VMSGTU_VX_UINT  - mask of elements unsigned-greater-than a scalar
 */
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m2(n)
#define FLOAT_V_T vfloat32m2_t
#define VLSEG2_FLOAT vlseg2e32_v_f32m2
#define VSSEG2_FLOAT vsseg2e32_v_f32m2
#define VSSEG2_FLOAT_M vsseg2e32_v_f32m2_m
#define VBOOL_T vbool16_t
#define UINT_V_T vuint32m2_t
#define VID_V_UINT vid_v_u32m2
#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16
#else
#define VSETVL(n) vsetvl_e64m2(n)
#define FLOAT_V_T vfloat64m2_t
#define VLSEG2_FLOAT vlseg2e64_v_f64m2
#define VSSEG2_FLOAT vsseg2e64_v_f64m2
#define VSSEG2_FLOAT_M vsseg2e64_v_f64m2_m
#define VBOOL_T vbool32_t
#define UINT_V_T vuint64m2_t
#define VID_V_UINT vid_v_u64m2
#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32
#endif
/*
 * Packs data from `a` into the contiguous buffer `b`, processing `n` in
 * chunks of `vl` (as returned by VSETVL) and walking positions 0..m-1
 * inside every chunk.
 *
 *   m, n    - extents of the two loops (inner and outer respectively)
 *   a       - source; interleaved pairs of FLOAT (two FLOATs per element)
 *   lda     - distance, in pairs of FLOAT, between consecutive positions
 *   offset  - position at which the first chunk meets the diagonal
 *   b       - destination; advanced by vl pairs for every position visited
 *
 * For each chunk:
 *   - positions before the diagonal are loaded contiguously (vl elements)
 *     and stored to b in full;
 *   - positions past the diagonal are not written, but b is still advanced;
 *   - at the diagonal, vl consecutive positions are handled together: for
 *     step d, compinv() fills lane d from the diagonal element and only
 *     lanes with index > d are stored (compinv is defined outside this file
 *     chunk; presumably it stores the complex reciprocal -- confirm).
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
    BLASLONG diag_pos = offset;
    BLASLONG remaining = n;
    size_t vl;

    while (remaining > 0)
    {
        vl = VSETVL(remaining);
        FLOAT *src = a;
        BLASLONG pos = 0;

        while (pos < m)
        {
            FLOAT_V_T re, im;

            if (pos != diag_pos)
            {
                /* off-diagonal position: copy only when before the diagonal */
                if (pos < diag_pos)
                {
                    VLSEG2_FLOAT(&re, &im, src, vl);
                    VSSEG2_FLOAT(b, re, im, vl);
                }
                src += lda * 2;
                b += vl * 2;
                pos++;
                continue;
            }

            /* diagonal: handle vl positions in one go */
            UINT_V_T lane_id = VID_V_UINT(vl);
            for (unsigned int d = 0; d < vl; d++)
            {
                compinv((b + d * 2), *(src + d * 2), *(src + d * 2 + 1));
                VBOOL_T lanes_after_d = VMSGTU_VX_UINT(lane_id, d, vl);
                VLSEG2_FLOAT(&re, &im, src, vl);
                VSSEG2_FLOAT_M(lanes_after_d, b, re, im, vl);
                b += vl * 2;
                src += lda * 2;
            }
            pos += vl;
        }

        a += vl * 2;
        diag_pos += vl;
        remaining -= vl;
    }
    return 0;
}

View File

@ -0,0 +1,113 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include "common.h"
/*
 * Precision-dependent aliases for the RVV intrinsics used by the routine below.
 * Without DOUBLE the 32-bit element (e32) variants are selected, with DOUBLE the
 * 64-bit (e64) ones.  Both branches use the m2 register grouping, so the mask
 * type is vbool16_t for e32 and vbool32_t for e64.
 *
 *   VLSSEG2_FLOAT   - strided two-field segment load
 *   VSSEG2_FLOAT    - unit-stride two-field segment store
 *   VSSEG2_FLOAT_M  - masked form of the segment store
 *   VID_V_UINT      - vector of element indices
 *   VMSGTU_VX_UINT  - mask of elements unsigned-greater-than a scalar
 */
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m2(n)
#define FLOAT_V_T vfloat32m2_t
#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2
#define VSSEG2_FLOAT vsseg2e32_v_f32m2
#define VSSEG2_FLOAT_M vsseg2e32_v_f32m2_m
#define VBOOL_T vbool16_t
#define UINT_V_T vuint32m2_t
#define VID_V_UINT vid_v_u32m2
#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16
#else
#define VSETVL(n) vsetvl_e64m2(n)
#define FLOAT_V_T vfloat64m2_t
#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2
#define VSSEG2_FLOAT vsseg2e64_v_f64m2
#define VSSEG2_FLOAT_M vsseg2e64_v_f64m2_m
#define VBOOL_T vbool32_t
#define UINT_V_T vuint64m2_t
#define VID_V_UINT vid_v_u64m2
#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32
#endif
/*
 * Packs data from `a` into the contiguous buffer `b`, processing `n` in
 * chunks of `vl` (as returned by VSETVL) and walking positions 0..m-1
 * inside every chunk.
 *
 *   m, n    - extents of the two loops (inner and outer respectively)
 *   a       - source; interleaved pairs of FLOAT (two FLOATs per element)
 *   lda     - element distance used for the strided loads (in pairs of FLOAT)
 *   offset  - position at which the first chunk meets the diagonal
 *   b       - destination; advanced by vl pairs for every position visited
 *
 * For each chunk:
 *   - positions before the diagonal are gathered (vl elements, lda apart)
 *     and stored to b in full;
 *   - positions past the diagonal are not written, but b is still advanced;
 *   - at the diagonal, vl consecutive positions are handled together: for
 *     step d, compinv() fills lane d from the diagonal element and only
 *     lanes with index > d are stored (compinv is defined outside this file
 *     chunk; presumably it stores the complex reciprocal -- confirm).
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
    /* byte stride between the elements gathered by one strided load */
    const BLASLONG gather_stride = sizeof(FLOAT)*lda*2;
    BLASLONG diag_pos = offset;
    BLASLONG remaining = n;
    size_t vl;

    while (remaining > 0)
    {
        vl = VSETVL(remaining);
        FLOAT *src = a;
        BLASLONG pos = 0;

        while (pos < m)
        {
            FLOAT_V_T re, im;

            if (pos != diag_pos)
            {
                /* off-diagonal position: copy only when before the diagonal */
                if (pos < diag_pos)
                {
                    VLSSEG2_FLOAT(&re, &im, src, gather_stride, vl);
                    VSSEG2_FLOAT(b, re, im, vl);
                }
                src += 2;
                b += vl * 2;
                pos++;
                continue;
            }

            /* diagonal: handle vl positions in one go */
            UINT_V_T lane_id = VID_V_UINT(vl);
            for (unsigned int d = 0; d < vl; d++)
            {
                compinv((b + d * 2), *(src + d * lda * 2), *(src + d * lda * 2 + 1));
                VBOOL_T lanes_after_d = VMSGTU_VX_UINT(lane_id, d, vl);
                VLSSEG2_FLOAT(&re, &im, src, gather_stride, vl);
                VSSEG2_FLOAT_M(lanes_after_d, b, re, im, vl);
                src += 2;
                b += vl * 2;
            }
            pos += vl;
        }

        a += vl * lda * 2;
        diag_pos += vl;
        remaining -= vl;
    }
    return 0;
}

View File

@ -0,0 +1,115 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include "common.h"
/*
 * Precision-dependent aliases for the RVV intrinsics used by the routine below.
 * Without DOUBLE the 32-bit element (e32) variants are selected, with DOUBLE the
 * 64-bit (e64) ones.  Both branches use the m2 register grouping, so the mask
 * type is vbool16_t for e32 and vbool32_t for e64.
 *
 *   VLSEG2_FLOAT    - unit-stride two-field segment load
 *   VSSEG2_FLOAT    - unit-stride two-field segment store
 *   VSSEG2_FLOAT_M  - masked form of the segment store
 *   VID_V_UINT      - vector of element indices
 *   VMSLTU_VX_UINT  - mask of elements unsigned-less-than a scalar
 */
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m2(n)
#define FLOAT_V_T vfloat32m2_t
#define VLSEG2_FLOAT vlseg2e32_v_f32m2
#define VSSEG2_FLOAT vsseg2e32_v_f32m2
#define VSSEG2_FLOAT_M vsseg2e32_v_f32m2_m
#define VBOOL_T vbool16_t
#define UINT_V_T vuint32m2_t
#define VID_V_UINT vid_v_u32m2
#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16
#else
#define VSETVL(n) vsetvl_e64m2(n)
#define FLOAT_V_T vfloat64m2_t
#define VLSEG2_FLOAT vlseg2e64_v_f64m2
#define VSSEG2_FLOAT vsseg2e64_v_f64m2
#define VSSEG2_FLOAT_M vsseg2e64_v_f64m2_m
#define VBOOL_T vbool32_t
#define UINT_V_T vuint64m2_t
#define VID_V_UINT vid_v_u64m2
#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32
#endif
/*
 * Packs data from `a` into the contiguous buffer `b`, processing `n` in
 * chunks of `vl` (as returned by VSETVL) and walking positions 0..m-1
 * inside every chunk.
 *
 *   m, n    - extents of the two loops (inner and outer respectively)
 *   a       - source; interleaved pairs of FLOAT (two FLOATs per element)
 *   lda     - distance, in pairs of FLOAT, between consecutive positions
 *   offset  - position at which the first chunk meets the diagonal
 *   b       - destination; advanced by vl pairs for every position visited
 *
 * For each chunk:
 *   - positions past the diagonal are loaded contiguously (vl elements)
 *     and stored to b in full;
 *   - positions before the diagonal are not written, but b is still advanced;
 *   - at the diagonal, vl consecutive positions are handled together: for
 *     step d only lanes with index < d are stored, and compinv() fills lane d
 *     from the diagonal element (compinv is defined outside this file chunk;
 *     presumably it stores the complex reciprocal -- confirm).
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
    BLASLONG diag_pos = offset;
    BLASLONG remaining = n;
    size_t vl;

    while (remaining > 0)
    {
        vl = VSETVL(remaining);
        FLOAT *src = a;
        BLASLONG pos = 0;

        while (pos < m)
        {
            FLOAT_V_T re, im;

            if (pos != diag_pos)
            {
                /* off-diagonal position: copy only when past the diagonal */
                if (pos > diag_pos)
                {
                    VLSEG2_FLOAT(&re, &im, src, vl);
                    VSSEG2_FLOAT(b, re, im, vl);
                }
                src += lda * 2;
                b += vl * 2;
                pos++;
                continue;
            }

            /* diagonal: handle vl positions in one go */
            UINT_V_T lane_id = VID_V_UINT(vl);
            for (unsigned int d = 0; d < vl; d++)
            {
                VBOOL_T lanes_before_d = VMSLTU_VX_UINT(lane_id, d, vl);
                VLSEG2_FLOAT(&re, &im, src, vl);
                VSSEG2_FLOAT_M(lanes_before_d, b, re, im, vl);
                compinv((b + d * 2), *(src + d * 2), *(src + d * 2 + 1));
                src += lda * 2;
                b += vl * 2;
            }
            pos += vl;
        }

        a += vl * 2;
        diag_pos += vl;
        remaining -= vl;
    }
    return 0;
}

10
param.h
View File

@ -3055,11 +3055,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define DGEMM_DEFAULT_UNROLL_N 8 //2 // 4
#define DGEMM_DEFAULT_UNROLL_MN 32
#define CGEMM_DEFAULT_UNROLL_M 2
#define CGEMM_DEFAULT_UNROLL_N 2
#define CGEMM_DEFAULT_UNROLL_M 8
#define CGEMM_DEFAULT_UNROLL_N 4
#define CGEMM_DEFAULT_UNROLL_MN 16
#define ZGEMM_DEFAULT_UNROLL_M 2
#define ZGEMM_DEFAULT_UNROLL_N 2
#define ZGEMM_DEFAULT_UNROLL_M 8
#define ZGEMM_DEFAULT_UNROLL_N 4
#define ZGEMM_DEFAULT_UNROLL_MN 16
#define SGEMM_DEFAULT_P 160
#define DGEMM_DEFAULT_P 160