* update intrinsics to match latest spec at https://github.com/riscv-non-isa/rvv-intrinsic-doc (in particular, __riscv_ prefixes for rvv intrinsics)

* fix multiple numerical stability and corner case issues
* add a script to generate arbitrary gemm kernel shapes
* add a generic zvl256b target to demonstrate large gemm kernel unrolls
Sergei Lewis 2023-02-24 10:44:55 +00:00
parent c19dff0a31
commit 2406958629
58 changed files with 18648 additions and 2388 deletions
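As a quick illustration of the first bullet above: the current rvv-intrinsic-doc prefixes every RVV intrinsic with __riscv_. A minimal sketch of the rename (illustrative only — the function name is made up and nothing here is taken from the diffs below; old spellings are shown in comments):

#include <stddef.h>
#include <riscv_vector.h>

/* Minimal sketch of the naming change, assuming single precision and LMUL=1. */
static float first_lane(const float *x, size_t n)
{
    size_t vl = __riscv_vsetvl_e32m1(n);            /* was: vsetvl_e32m1(n)       */
    vfloat32m1_t v = __riscv_vle32_v_f32m1(x, vl);  /* was: vle32_v_f32m1(x, vl)  */
    return __riscv_vfmv_f_s_f32m1_f32(v);           /* was: vfmv_f_s_f32m1_f32(v) */
}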

View File

@@ -91,12 +91,15 @@ static inline int blas_quickdivide(blasint x, blasint y){
#define BUFFER_SIZE ( 32 << 20)
#define SEEK_ADDRESS
#if defined(C910V)
#include <riscv_vector.h>
#endif
#if defined(x280)
#include <riscv_vector.h>
#if defined(C910V) || defined(RISCV64_ZVL256B) || defined(__riscv_v)
# include <riscv_vector.h>
# if !defined(DOUBLE)
# define EXTRACT_FLOAT(v) __riscv_vfmv_f_s_f32m1_f32(v)
# else
# define EXTRACT_FLOAT(v) __riscv_vfmv_f_s_f64m1_f64(v)
# endif
#else
# define EXTRACT_FLOAT(v) (v[0])
#endif
#endif
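The hunk above also centralizes how a kernel reads back the scalar result of a vector reduction: on RVV targets EXTRACT_FLOAT maps to __riscv_vfmv_f_s_*, otherwise it falls back to indexing element 0. A hedged usage sketch (the helper name is made up, and it only reduces the first vl elements, for brevity):

#include <stddef.h>
#include <riscv_vector.h>

static float partial_sum_f32(const float *x, size_t n)
{
    size_t vl = __riscv_vsetvl_e32m8(n);
    vfloat32m8_t v   = __riscv_vle32_v_f32m8(x, vl);      /* first vl elements only      */
    vfloat32m1_t acc = __riscv_vfmv_v_f_f32m1(0.0f, 1);   /* 1-element accumulator       */
    acc = __riscv_vfredusum_vs_f32m8_f32m1(v, acc, vl);   /* unordered sum reduction     */
    return __riscv_vfmv_f_s_f32m1_f32(acc);               /* what EXTRACT_FLOAT(acc) expands to here */
}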

View File

@@ -70,14 +70,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define CPU_GENERIC 0
#define CPU_C910V 1
#define CPU_x280 2
#define CPU_GENERIC 0
#define CPU_C910V 1
#define CPU_RISCV64_ZVL256B 2
static char *cpuname[] = {
"RISCV64_GENERIC",
"C910V"
"x280"
"C910V",
"CPU_RISCV64_ZVL256B"
};
int detect(void){

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,199 @@
SAMAXKERNEL = amax_vector.c
DAMAXKERNEL = amax_vector.c
CAMAXKERNEL = zamax_vector.c
ZAMAXKERNEL = zamax_vector.c
SAMINKERNEL = amin_vector.c
DAMINKERNEL = amin_vector.c
CAMINKERNEL = zamin_vector.c
ZAMINKERNEL = zamin_vector.c
SMAXKERNEL = max_vector.c
DMAXKERNEL = max_vector.c
SMINKERNEL = min_vector.c
DMINKERNEL = min_vector.c
ISAMAXKERNEL = iamax_vector.c
IDAMAXKERNEL = iamax_vector.c
ICAMAXKERNEL = izamax_vector.c
IZAMAXKERNEL = izamax_vector.c
ISAMINKERNEL = iamin_vector.c
IDAMINKERNEL = iamin_vector.c
ICAMINKERNEL = izamin_vector.c
IZAMINKERNEL = izamin_vector.c
ISMAXKERNEL = imax_vector.c
IDMAXKERNEL = imax_vector.c
ISMINKERNEL = imin_vector.c
IDMINKERNEL = imin_vector.c
SASUMKERNEL = asum_vector.c
DASUMKERNEL = asum_vector.c
CASUMKERNEL = zasum_vector.c
ZASUMKERNEL = zasum_vector.c
SSUMKERNEL = sum_vector.c
DSUMKERNEL = sum_vector.c
CSUMKERNEL = zsum_vector.c
ZSUMKERNEL = zsum_vector.c
SAXPYKERNEL = axpy_vector.c
DAXPYKERNEL = axpy_vector.c
CAXPYKERNEL = zaxpy_vector.c
ZAXPYKERNEL = zaxpy_vector.c
SCOPYKERNEL = copy_vector.c
DCOPYKERNEL = copy_vector.c
CCOPYKERNEL = zcopy_vector.c
ZCOPYKERNEL = zcopy_vector.c
SDOTKERNEL = dot_vector.c
DDOTKERNEL = dot_vector.c
CDOTKERNEL = zdot_vector.c
ZDOTKERNEL = zdot_vector.c
DSDOTKERNEL = ../generic/dot.c
SNRM2KERNEL = nrm2_vector.c
DNRM2KERNEL = nrm2_vector.c
CNRM2KERNEL = znrm2_vector.c
ZNRM2KERNEL = znrm2_vector.c
SROTKERNEL = rot_vector.c
DROTKERNEL = rot_vector.c
CROTKERNEL = zrot_vector.c
ZROTKERNEL = zrot_vector.c
SSCALKERNEL = scal_vector.c
DSCALKERNEL = scal_vector.c
CSCALKERNEL = zscal_vector.c
ZSCALKERNEL = zscal_vector.c
SSWAPKERNEL = swap_vector.c
DSWAPKERNEL = swap_vector.c
CSWAPKERNEL = zswap_vector.c
ZSWAPKERNEL = zswap_vector.c
SGEMVNKERNEL = gemv_n_vector.c
DGEMVNKERNEL = gemv_n_vector.c
CGEMVNKERNEL = zgemv_n_vector.c
ZGEMVNKERNEL = zgemv_n_vector.c
SGEMVTKERNEL = gemv_t_vector.c
DGEMVTKERNEL = gemv_t_vector.c
CGEMVTKERNEL = zgemv_t_vector.c
ZGEMVTKERNEL = zgemv_t_vector.c
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl256b.c
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl256b.c
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl256b.c
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl256b.c
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl256b.c
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl256b.c
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl256b.c
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl256b.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
SSYMV_U_KERNEL = symv_U_vector.c
SSYMV_L_KERNEL = symv_L_vector.c
DSYMV_U_KERNEL = symv_U_vector.c
DSYMV_L_KERNEL = symv_L_vector.c
CSYMV_U_KERNEL = ../generic/zsymv_k.c
CSYMV_L_KERNEL = ../generic/zsymv_k.c
ZSYMV_U_KERNEL = ../generic/zsymv_k.c
ZSYMV_L_KERNEL = ../generic/zsymv_k.c
CHEMV_L_KERNEL = zhemv_LM_vector.c
CHEMV_M_KERNEL = zhemv_LM_vector.c
CHEMV_U_KERNEL = zhemv_UV_vector.c
CHEMV_V_KERNEL = zhemv_UV_vector.c
ZHEMV_L_KERNEL = zhemv_LM_vector.c
ZHEMV_M_KERNEL = zhemv_LM_vector.c
ZHEMV_U_KERNEL = zhemv_UV_vector.c
ZHEMV_V_KERNEL = zhemv_UV_vector.c
LSAME_KERNEL = ../generic/lsame.c
SCABS_KERNEL = ../generic/cabs.c
DCABS_KERNEL = ../generic/cabs.c
QCABS_KERNEL = ../generic/cabs.c
ifndef SGEMM_BETA
SGEMM_BETA = ../generic/gemm_beta.c
endif
ifndef DGEMM_BETA
DGEMM_BETA = ../generic/gemm_beta.c
endif
ifndef CGEMM_BETA
CGEMM_BETA = ../generic/zgemm_beta.c
endif
ifndef ZGEMM_BETA
ZGEMM_BETA = ../generic/zgemm_beta.c
endif

View File

@@ -28,36 +28,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include <math.h>
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
#define MASK_T vbool4_t
#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
#define VFMAXVV_FLOAT vfmax_vv_f32m8
#ifdef RISCV64_ZVL256B
# define LMUL m2
# if defined(DOUBLE)
# define ELEN 64
# else
# define ELEN 32
# endif
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
#define MASK_T vbool8_t
#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
#define VFMAXVV_FLOAT vfmax_vv_f64m8
# define LMUL m8
# if defined(DOUBLE)
# define ELEN 64
# else
# define ELEN 32
# endif
#endif
#define _
#define JOIN2_X(x, y) x ## y
#define JOIN2(x, y) JOIN2_X(x, y)
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _)
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _)
#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL)
#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL)
#define VFREDMAXVS_FLOAT JOIN(__riscv_vfredmax_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1))
#define VFABS_FLOAT JOIN(__riscv_vfabs, _v_f, ELEN, LMUL, _)
#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _)
#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _)
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i=0, j=0;
@@ -65,103 +66,28 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
FLOAT maxf=0.0;
if (n <= 0 || inc_x <= 0) return(maxf);
unsigned int gvl = 0;
FLOAT_V_T v0, v1, v_max;
FLOAT_V_T_M1 v_res, v_zero;
gvl = VSETVL_MAX;
v_res = VFMVVF_FLOAT_M1(0, gvl);
v_zero = VFMVVF_FLOAT_M1(0, gvl);
FLOAT_V_T v0, v1;
FLOAT_V_T_M1 v_res;
v_res = VFMVVF_FLOAT_M1(0, 1);
MASK_T mask0, mask1;
FLOAT zero = 0.0;
if(inc_x == 1){
gvl = VSETVL(n);
if(gvl <= n/2){
v_max = VFMVVF_FLOAT(0, gvl);
for(i=0,j=0; i<n/(gvl*2); i++){
v0 = VLEV_FLOAT(&x[j], gvl);
v1 = VLEV_FLOAT(&x[j+gvl], gvl);
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
//v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
#if defined(DOUBLE)
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v0)
:"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v0)
:"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#endif
v_max = VFMAXVV_FLOAT(v_max, v0, gvl);
v1 = VLEV_FLOAT(&x[j+gvl], gvl);
mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
//v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl);
#if defined(DOUBLE)
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v1)
:"vd"(mask1), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v1)
:"vd"(mask1), "f"(zero), "r"(gvl)
:"v0");
#endif
v_max = VFMAXVV_FLOAT(v_max, v1, gvl);
v0 = VFABS_FLOAT(v0, gvl);
v1 = VFABS_FLOAT(v1, gvl);
v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl);
v_res = VFREDMAXVS_FLOAT(v1, v_res, gvl);
j += gvl*2;
}
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_zero, gvl);
maxf = *((FLOAT*)&v_res);
//maxf = v_res[0];
}
for(;j<n;){
gvl = VSETVL(n-j);
v0 = VLEV_FLOAT(&x[j], gvl);
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
//v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
#if defined(DOUBLE)
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v0)
:"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v0)
:"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#endif
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_zero, gvl);
if(*((FLOAT*)&v_res) > maxf)
maxf = *((FLOAT*)&v_res);
v0 = VFABS_FLOAT(v0, gvl);
v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl);
j += gvl;
}
}else{
@@ -169,94 +95,27 @@ asm volatile(
BLASLONG stride_x = inc_x * sizeof(FLOAT);
if(gvl <= n/2){
BLASLONG inc_xv = inc_x * gvl;
v_max = VFMVVF_FLOAT(0, gvl);
for(i=0,j=0; i<n/(gvl*2); i++){
v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
//v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
#if defined(DOUBLE)
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v0)
:"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v0)
:"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#endif
v_max = VFMAXVV_FLOAT(v_max, v0, gvl);
v1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl);
mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
//v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl);
#if defined(DOUBLE)
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v1)
:"vd"(mask1), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v1)
:"vd"(mask1), "f"(zero), "r"(gvl)
:"v0");
#endif
v_max = VFMAXVV_FLOAT(v_max, v1, gvl);
v0 = VFABS_FLOAT(v0, gvl);
v1 = VFABS_FLOAT(v1, gvl);
v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl);
v_res = VFREDMAXVS_FLOAT(v1, v_res, gvl);
j += gvl*2;
ix += inc_xv*2;
}
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_zero, gvl);
maxf = *((FLOAT*)&v_res);
}
for(;j<n;){
gvl = VSETVL(n-j);
v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
//v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
#if defined(DOUBLE)
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v0)
:"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v0)
:"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#endif
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_zero, gvl);
if(*((FLOAT*)&v_res) > maxf)
maxf = *((FLOAT*)&v_res);
v0 = VFABS_FLOAT(v0, gvl);
v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl);
j += gvl;
}
}
maxf = EXTRACT_FLOAT(v_res);
return(maxf);
}
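A note on the JOIN macros introduced in the kernel above: they replace the per-precision #define blocks with token pasting, so a single set of definitions covers ELEN=32/64 and the LMUL choice (m2 on the zvl256b target, m8 otherwise), with the empty macro _ serving as a no-op fifth argument. A self-contained sketch of the expansion, assuming ELEN=32 and LMUL=m8:

#define _
#define JOIN2_X(x, y) x ## y
#define JOIN2(x, y)   JOIN2_X(x, y)
#define JOIN(v, w, x, y, z) JOIN2(JOIN2(JOIN2(JOIN2(v, w), x), y), z)

#define ELEN 32
#define LMUL m8

/* The arguments are pasted left to right after macro expansion: */
#define VSETVL     JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _)    /* -> __riscv_vsetvl_e32m8  */
#define FLOAT_V_T  JOIN(vfloat, ELEN, LMUL, _t, _)            /* -> vfloat32m8_t          */
#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL)  /* -> __riscv_vle32_v_f32m8 */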

View File

@@ -26,232 +26,100 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
#include <float.h>
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
#define MASK_T vbool4_t
#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
#define VFMINVV_FLOAT vfmin_vv_f32m8
#ifdef RISCV64_ZVL256B
# define LMUL m2
# if defined(DOUBLE)
# define ELEN 64
# else
# define ELEN 32
# endif
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
#define MASK_T vbool8_t
#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
#define VFMINVV_FLOAT vfmin_vv_f64m8
# define LMUL m8
# if defined(DOUBLE)
# define ELEN 64
# else
# define ELEN 32
# endif
#endif
#define _
#define JOIN2_X(x, y) x ## y
#define JOIN2(x, y) JOIN2_X(x, y)
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _)
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _)
#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL)
#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL)
#define VFREDMINVS_FLOAT JOIN(__riscv_vfredmin_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1))
#define VFABS_FLOAT JOIN(__riscv_vfabs, _v_f, ELEN, LMUL, _)
#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _)
#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _)
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i=0, j=0;
if (n <= 0 || inc_x <= 0) return(0.0);
FLOAT minf=FLT_MAX;
unsigned int gvl = 0;
FLOAT_V_T v0, v1, v_min;
FLOAT_V_T_M1 v_res, v_max;
gvl = VSETVL_MAX;
v_res = VFMVVF_FLOAT_M1(0, gvl);
v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl);
BLASLONG i=0, j=0;
BLASLONG ix=0;
FLOAT minf=0.0;
if (n <= 0 || inc_x <= 0) return(minf);
minf = *x;
x += inc_x;
--n;
if (n == 0) return(minf);
unsigned int gvl = 0;
FLOAT_V_T v0, v1;
FLOAT_V_T_M1 v_res;
v_res = VFMVVF_FLOAT_M1(minf, 1);
MASK_T mask0, mask1;
FLOAT zero = 0.0;
if(inc_x == 1){
gvl = VSETVL(n);
if(gvl <= n/2){
v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
for(i=0,j=0; i<n/(gvl*2); i++){
v0 = VLEV_FLOAT(&x[j], gvl);
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
//v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
#if defined(DOUBLE)
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v0)
:"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v0)
:"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#endif
v_min = VFMINVV_FLOAT(v_min, v0, gvl);
v1 = VLEV_FLOAT(&x[j+gvl], gvl);
mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
//v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl);
#if defined(DOUBLE)
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v1)
:"vd"(mask1), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v1)
:"vd"(mask1), "f"(zero), "r"(gvl)
:"v0");
#endif
v_min = VFMINVV_FLOAT(v_min, v1, gvl);
v0 = VFABS_FLOAT(v0, gvl);
v1 = VFABS_FLOAT(v1, gvl);
v_res = VFREDMINVS_FLOAT(v0, v_res, gvl);
v_res = VFREDMINVS_FLOAT(v1, v_res, gvl);
j += gvl*2;
}
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
minf = *((FLOAT*)&v_res);
}
for(;j<n;){
gvl = VSETVL(n-j);
v0 = VLEV_FLOAT(&x[j], gvl);
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
//v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
#if defined(DOUBLE)
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v0)
:"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v0)
:"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#endif
v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl);
if(*((FLOAT*)&v_res) < minf)
minf = *((FLOAT*)&v_res);
v0 = VFABS_FLOAT(v0, gvl);
v_res = VFREDMINVS_FLOAT(v0, v_res, gvl);
j += gvl;
}
}else{
gvl = VSETVL(n);
BLASLONG stride_x = inc_x * sizeof(FLOAT);
if(gvl <= n/2){
BLASLONG idx = 0, inc_xv = inc_x * gvl;
v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
BLASLONG inc_xv = inc_x * gvl;
for(i=0,j=0; i<n/(gvl*2); i++){
v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl);
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
//v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
#if defined(DOUBLE)
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v0)
:"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v0)
:"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#endif
v_min = VFMINVV_FLOAT(v_min, v0, gvl);
v1 = VLSEV_FLOAT(&x[idx+inc_xv], stride_x, gvl);
mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
//v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl);
#if defined(DOUBLE)
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v1)
:"vd"(mask1), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v1)
:"vd"(mask1), "f"(zero), "r"(gvl)
:"v0");
#endif
v_min = VFMINVV_FLOAT(v_min, v1, gvl);
v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
v1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl);
v0 = VFABS_FLOAT(v0, gvl);
v1 = VFABS_FLOAT(v1, gvl);
v_res = VFREDMINVS_FLOAT(v0, v_res, gvl);
v_res = VFREDMINVS_FLOAT(v1, v_res, gvl);
j += gvl*2;
idx += inc_xv*2;
ix += inc_xv*2;
}
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
minf = *((FLOAT*)&v_res);
}
for(;j<n;){
gvl = VSETVL(n-j);
v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
//v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
#if defined(DOUBLE)
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v0)
:"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v0)
:"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#endif
v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl);
if(*((FLOAT*)&v_res) < minf)
minf = *((FLOAT*)&v_res);
v0 = VFABS_FLOAT(v0, gvl);
v_res = VFREDMINVS_FLOAT(v0, v_res, gvl);
j += gvl;
}
}
return(minf);
minf = EXTRACT_FLOAT(v_res);
return(minf);
}

View File

@@ -28,35 +28,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include <math.h>
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VFREDSUMVS_FLOAT vfredosum_vs_f32m8_f32m1
#define MASK_T vbool4_t
#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
#define VFADDVV_FLOAT vfadd_vv_f32m8
#ifdef RISCV64_ZVL256B
# define LMUL m2
# if defined(DOUBLE)
# define ELEN 64
# else
# define ELEN 32
# endif
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1
#define MASK_T vbool8_t
#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
#define VFADDVV_FLOAT vfadd_vv_f64m8
# define LMUL m8
# if defined(DOUBLE)
# define ELEN 64
# else
# define ELEN 32
# endif
#endif
#define _
#define JOIN2_X(x, y) x ## y
#define JOIN2(x, y) JOIN2_X(x, y)
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _)
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _)
#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL)
#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL)
#define VFREDSUMVS_FLOAT JOIN(__riscv_vfredusum_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1))
#define VFABS_FLOAT JOIN(__riscv_vfabs, _v_f, ELEN, LMUL, _)
#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _)
#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _)
#define VFADDVV_FLOAT JOIN(__riscv_vfadd, _vv_f, ELEN, LMUL, _)
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i=0, j=0;
@@ -64,75 +67,61 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
FLOAT asumf=0.0;
if (n <= 0 || inc_x <= 0) return(asumf);
unsigned int gvl = 0;
FLOAT_V_T v0, v1, v_zero,v_sum;
FLOAT_V_T_M1 v_res, v_z0;
gvl = VSETVL_MAX;
v_res = VFMVVF_FLOAT_M1(0, gvl);
v_z0 = VFMVVF_FLOAT_M1(0, gvl);
FLOAT_V_T v0, v1, v_sum;
FLOAT_V_T_M1 v_res;
v_res = VFMVVF_FLOAT_M1(0, 1);
MASK_T mask0, mask1;
if(inc_x == 1){
gvl = VSETVL(n);
v_zero = VFMVVF_FLOAT(0, gvl);
if(gvl <= n/2){
v_sum = VFMVVF_FLOAT(0, gvl);
for(i=0,j=0; i<n/(gvl*2); i++){
v0 = VLEV_FLOAT(&x[j], gvl);
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
v0 = VFABS_FLOAT(v0, gvl);
v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);
v1 = VLEV_FLOAT(&x[j+gvl], gvl);
mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
v1 = VFABS_FLOAT(v1, gvl);
v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);
j += gvl * 2;
}
v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl);
asumf += *((FLOAT*)&v_res);
v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl);
}
for(;j<n;){
gvl = VSETVL(n-j);
v0 = VLEV_FLOAT(&x[j], gvl);
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
v_res = VFREDSUMVS_FLOAT(v_res, v0, v_z0, gvl);
asumf += *((FLOAT*)&v_res);
v0 = VFABS_FLOAT(v0, gvl);
v_res = VFREDSUMVS_FLOAT(v0, v_res, gvl);
j += gvl;
}
}else{
gvl = VSETVL(n);
unsigned int stride_x = inc_x * sizeof(FLOAT);
v_zero = VFMVVF_FLOAT(0, gvl);
if(gvl <= n/2){
v_sum = VFMVVF_FLOAT(0, gvl);
BLASLONG inc_xv = inc_x * gvl;
for(i=0,j=0; i<n/(gvl*2); i++){
v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
v0 = VFABS_FLOAT(v0, gvl);
v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);
v1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl);
mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
v1 = VFABS_FLOAT(v1, gvl);
v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);
j += gvl * 2;
inc_xv += inc_xv * 2;
}
v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl);
asumf += *((FLOAT*)&v_res);
v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl);
}
for(;j<n;){
gvl = VSETVL(n-j);
v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
v_res = VFREDSUMVS_FLOAT(v_res, v0, v_z0, gvl);
asumf += *((FLOAT*)&v_res);
v0 = VFABS_FLOAT(v0, gvl);
v_res = VFREDSUMVS_FLOAT(v0, v_res, gvl);
j += gvl;
}
}
asumf = EXTRACT_FLOAT(v_res);
return(asumf);
}
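One change that recurs throughout these kernels is the shape of the reduction call: the old intrinsics took an explicit destination operand, while the current spec's non-policy form takes only (vector, scalar, vl), so the running result is threaded through the scalar operand and the separate zero register disappears. A hedged sketch for the single-precision sum used above (function name made up; m8 chosen for illustration):

#include <stddef.h>
#include <riscv_vector.h>

/* Old spec (e.g. vfredosum_vs_f32m8_f32m1) was called as
 *   v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl);
 * The new form carries the accumulator in the scalar operand:        */
static vfloat32m1_t reduce_sum(vfloat32m8_t v_sum, vfloat32m1_t v_res, size_t gvl)
{
    return __riscv_vfredusum_vs_f32m8_f32m1(v_sum, v_res, gvl);
}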

View File

@@ -27,28 +27,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define FLOAT_V_T vfloat32m4_t
#define VLEV_FLOAT vle32_v_f32m4
#define VLSEV_FLOAT vlse32_v_f32m4
#define VSEV_FLOAT vse32_v_f32m4
#define VSSEV_FLOAT vsse32_v_f32m4
#define VFMACCVF_FLOAT vfmacc_vf_f32m4
#define VFMVVF_FLOAT vfmv_v_f_f32m4
#define VFMULVF_FLOAT vfmul_vf_f32m4
#ifdef RISCV64_ZVL256B
# define LMUL m2
# if defined(DOUBLE)
# define ELEN 64
# else
# define ELEN 32
# endif
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define FLOAT_V_T vfloat64m4_t
#define VLEV_FLOAT vle64_v_f64m4
#define VLSEV_FLOAT vlse64_v_f64m4
#define VSEV_FLOAT vse64_v_f64m4
#define VSSEV_FLOAT vsse64_v_f64m4
#define VFMACCVF_FLOAT vfmacc_vf_f64m4
#define VFMVVF_FLOAT vfmv_v_f_f64m4
#define VFMULVF_FLOAT vfmul_vf_f64m4
# define LMUL m4
# if defined(DOUBLE)
# define ELEN 64
# else
# define ELEN 32
# endif
#endif
#define _
#define JOIN2_X(x, y) x ## y
#define JOIN2(x, y) JOIN2_X(x, y)
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _)
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL)
#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL)
#define VSEV_FLOAT JOIN(__riscv_vse, ELEN, _v_f, ELEN, LMUL)
#define VSSEV_FLOAT JOIN(__riscv_vsse, ELEN, _v_f, ELEN, LMUL)
#define VFMACCVF_FLOAT JOIN(__riscv_vfmacc, _vf_f, ELEN, LMUL, _)
#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _)
#define VFMULVF_FLOAT JOIN(__riscv_vfmul, _vf_f, ELEN, LMUL, _)
int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y)
{
if (n < 0) return(0);

View File

@@ -25,26 +25,38 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define FLOAT_V_T vfloat32m4_t
#define VLEV_FLOAT vle32_v_f32m4
#define VLSEV_FLOAT vlse32_v_f32m4
#define VSEV_FLOAT vse32_v_f32m4
#define VSSEV_FLOAT vsse32_v_f32m4
#define VFMACCVF_FLOAT vfmacc_vf_f32m4
#ifdef RISCV64_ZVL256B
# define LMUL m2
# if defined(DOUBLE)
# define ELEN 64
# else
# define ELEN 32
# endif
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define FLOAT_V_T vfloat64m4_t
#define VLEV_FLOAT vle64_v_f64m4
#define VLSEV_FLOAT vlse64_v_f64m4
#define VSEV_FLOAT vse64_v_f64m4
#define VSSEV_FLOAT vsse64_v_f64m4
#define VFMACCVF_FLOAT vfmacc_vf_f64m4
# define LMUL m4
# if defined(DOUBLE)
# define ELEN 64
# else
# define ELEN 32
# endif
#endif
#define _
#define JOIN2_X(x, y) x ## y
#define JOIN2(x, y) JOIN2_X(x, y)
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _)
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL)
#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL)
#define VSEV_FLOAT JOIN(__riscv_vse, ELEN, _v_f, ELEN, LMUL)
#define VSSEV_FLOAT JOIN(__riscv_vsse, ELEN, _v_f, ELEN, LMUL)
#define VFMACCVF_FLOAT JOIN(__riscv_vfmacc, _vf_f, ELEN, LMUL, _)
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
BLASLONG i=0, j=0, jx=0, jy=0;

File diff suppressed because it is too large

View File

@@ -25,22 +25,35 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define FLOAT_V_T vfloat32m8_t
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VSEV_FLOAT vse32_v_f32m8
#define VSSEV_FLOAT vsse32_v_f32m8
#ifdef RISCV64_ZVL256B
# define LMUL m2
# if defined(DOUBLE)
# define ELEN 64
# else
# define ELEN 32
# endif
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define FLOAT_V_T vfloat64m8_t
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VSEV_FLOAT vse64_v_f64m8
#define VSSEV_FLOAT vsse64_v_f64m8
# define LMUL m8
# if defined(DOUBLE)
# define ELEN 64
# else
# define ELEN 32
# endif
#endif
#define _
#define JOIN2_X(x, y) x ## y
#define JOIN2(x, y) JOIN2_X(x, y)
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _)
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL)
#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL)
#define VSEV_FLOAT JOIN(__riscv_vse, ELEN, _v_f, ELEN, LMUL)
#define VSSEV_FLOAT JOIN(__riscv_vsse, ELEN, _v_f, ELEN, LMUL)
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
BLASLONG i=0, j=0;

File diff suppressed because it is too large

View File

@@ -0,0 +1,860 @@
/*
AUTOGENERATED KERNEL
Settings:
LMUL=1
M=8
M_tail_scalar_from=2
N=8
__riscv_='__riscv_'
complex=False
conjugate=False
cpu='zvl256b'
force_acc_double=False
index_type='BLASLONG'
op='gemm'
param_precision='double'
reg_width_bits=256
tail_policy=''
trace=False
Derived:
ELEN_ACC=64
ELEN_PARAM=64
LMUL_ACC=1
VFMACC='__riscv_vfmacc_vf_f64m1'
VFMUL='__riscv_vfmul_vf_f64m1'
VLEV='__riscv_vle64_v_f64m1'
VLSEV='__riscv_vlse64_v_f64m1'
VMACC_TO_ACC='__riscv_vfmacc_vf_f64m1'
VMUL_TO_ACC='__riscv_vfmul_vf_f64m1'
VSETVL='__riscv_vsetvl_e64m1'
VSEV='__riscv_vse64_v_f64m1'
VSSEV='__riscv_vsse64_v_f64m1'
acc_vector_t='vfloat64m1_t'
output='dgemm_kernel_8x8_zvl256b.c'
param_scalar_t='double'
param_vector_t='vfloat64m1_t'
*/
#include "common.h"
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc)
{
BLASLONG gvl = 0;
BLASLONG m_top = 0;
BLASLONG n_top = 0;
// -- MAIN PASS
for (BLASLONG j=0; j<N/8; j+=1) {
m_top = 0;
BLASLONG gvl = __riscv_vsetvl_e64m1(4);
for (BLASLONG i=0; i<M/8; i+=1) {
BLASLONG ai=m_top*K;
BLASLONG bi=n_top*K;
double B0 = B[bi+0];
double B1 = B[bi+1];
double B2 = B[bi+2];
double B3 = B[bi+3];
double B4 = B[bi+4];
double B5 = B[bi+5];
double B6 = B[bi+6];
double B7 = B[bi+7];
bi += 8;
vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
vfloat64m1_t A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
ai += 8;
vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A1, B0, gvl);
vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B1, gvl);
vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A1, B1, gvl);
vfloat64m1_t result4 = __riscv_vfmul_vf_f64m1( A0, B2, gvl);
vfloat64m1_t result5 = __riscv_vfmul_vf_f64m1( A1, B2, gvl);
vfloat64m1_t result6 = __riscv_vfmul_vf_f64m1( A0, B3, gvl);
vfloat64m1_t result7 = __riscv_vfmul_vf_f64m1( A1, B3, gvl);
vfloat64m1_t result8 = __riscv_vfmul_vf_f64m1( A0, B4, gvl);
vfloat64m1_t result9 = __riscv_vfmul_vf_f64m1( A1, B4, gvl);
vfloat64m1_t result10 = __riscv_vfmul_vf_f64m1( A0, B5, gvl);
vfloat64m1_t result11 = __riscv_vfmul_vf_f64m1( A1, B5, gvl);
vfloat64m1_t result12 = __riscv_vfmul_vf_f64m1( A0, B6, gvl);
vfloat64m1_t result13 = __riscv_vfmul_vf_f64m1( A1, B6, gvl);
vfloat64m1_t result14 = __riscv_vfmul_vf_f64m1( A0, B7, gvl);
vfloat64m1_t result15 = __riscv_vfmul_vf_f64m1( A1, B7, gvl);
for(BLASLONG k=1; k<K; k++) {
B0 = B[bi+0];
B1 = B[bi+1];
B2 = B[bi+2];
B3 = B[bi+3];
B4 = B[bi+4];
B5 = B[bi+5];
B6 = B[bi+6];
B7 = B[bi+7];
bi += 8;
A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
ai += 8;
result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
result1 = __riscv_vfmacc_vf_f64m1( result1, B0, A1, gvl);
result2 = __riscv_vfmacc_vf_f64m1( result2, B1, A0, gvl);
result3 = __riscv_vfmacc_vf_f64m1( result3, B1, A1, gvl);
result4 = __riscv_vfmacc_vf_f64m1( result4, B2, A0, gvl);
result5 = __riscv_vfmacc_vf_f64m1( result5, B2, A1, gvl);
result6 = __riscv_vfmacc_vf_f64m1( result6, B3, A0, gvl);
result7 = __riscv_vfmacc_vf_f64m1( result7, B3, A1, gvl);
result8 = __riscv_vfmacc_vf_f64m1( result8, B4, A0, gvl);
result9 = __riscv_vfmacc_vf_f64m1( result9, B4, A1, gvl);
result10 = __riscv_vfmacc_vf_f64m1( result10, B5, A0, gvl);
result11 = __riscv_vfmacc_vf_f64m1( result11, B5, A1, gvl);
result12 = __riscv_vfmacc_vf_f64m1( result12, B6, A0, gvl);
result13 = __riscv_vfmacc_vf_f64m1( result13, B6, A1, gvl);
result14 = __riscv_vfmacc_vf_f64m1( result14, B7, A0, gvl);
result15 = __riscv_vfmacc_vf_f64m1( result15, B7, A1, gvl);
}
BLASLONG ci=n_top*ldc+m_top;
vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
vfloat64m1_t c2 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
vfloat64m1_t c3 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
vfloat64m1_t c4 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
vfloat64m1_t c5 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
vfloat64m1_t c6 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
vfloat64m1_t c7 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
vfloat64m1_t c8 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
vfloat64m1_t c9 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
vfloat64m1_t c10 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
vfloat64m1_t c11 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
vfloat64m1_t c12 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
vfloat64m1_t c13 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
vfloat64m1_t c14 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
vfloat64m1_t c15 = __riscv_vle64_v_f64m1( &C[ci], gvl);
c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl );
c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl );
c2 = __riscv_vfmacc_vf_f64m1( c2, alpha, result2, gvl );
c3 = __riscv_vfmacc_vf_f64m1( c3, alpha, result3, gvl );
c4 = __riscv_vfmacc_vf_f64m1( c4, alpha, result4, gvl );
c5 = __riscv_vfmacc_vf_f64m1( c5, alpha, result5, gvl );
c6 = __riscv_vfmacc_vf_f64m1( c6, alpha, result6, gvl );
c7 = __riscv_vfmacc_vf_f64m1( c7, alpha, result7, gvl );
c8 = __riscv_vfmacc_vf_f64m1( c8, alpha, result8, gvl );
c9 = __riscv_vfmacc_vf_f64m1( c9, alpha, result9, gvl );
c10 = __riscv_vfmacc_vf_f64m1( c10, alpha, result10, gvl );
c11 = __riscv_vfmacc_vf_f64m1( c11, alpha, result11, gvl );
c12 = __riscv_vfmacc_vf_f64m1( c12, alpha, result12, gvl );
c13 = __riscv_vfmacc_vf_f64m1( c13, alpha, result13, gvl );
c14 = __riscv_vfmacc_vf_f64m1( c14, alpha, result14, gvl );
c15 = __riscv_vfmacc_vf_f64m1( c15, alpha, result15, gvl );
ci=n_top*ldc+m_top;
__riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl;
__riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*1;
__riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += gvl;
__riscv_vse64_v_f64m1( &C[ci], c3, gvl); ci += ldc-gvl*1;
__riscv_vse64_v_f64m1( &C[ci], c4, gvl); ci += gvl;
__riscv_vse64_v_f64m1( &C[ci], c5, gvl); ci += ldc-gvl*1;
__riscv_vse64_v_f64m1( &C[ci], c6, gvl); ci += gvl;
__riscv_vse64_v_f64m1( &C[ci], c7, gvl); ci += ldc-gvl*1;
__riscv_vse64_v_f64m1( &C[ci], c8, gvl); ci += gvl;
__riscv_vse64_v_f64m1( &C[ci], c9, gvl); ci += ldc-gvl*1;
__riscv_vse64_v_f64m1( &C[ci], c10, gvl); ci += gvl;
__riscv_vse64_v_f64m1( &C[ci], c11, gvl); ci += ldc-gvl*1;
__riscv_vse64_v_f64m1( &C[ci], c12, gvl); ci += gvl;
__riscv_vse64_v_f64m1( &C[ci], c13, gvl); ci += ldc-gvl*1;
__riscv_vse64_v_f64m1( &C[ci], c14, gvl); ci += gvl;
__riscv_vse64_v_f64m1( &C[ci], c15, gvl);
m_top += 8;
}
// -- tails for main pass
if( M & 4 ) {
gvl = __riscv_vsetvl_e64m1(4);
BLASLONG ai=m_top*K;
BLASLONG bi=n_top*K;
double B0 = B[bi+0];
double B1 = B[bi+1];
double B2 = B[bi+2];
double B3 = B[bi+3];
double B4 = B[bi+4];
double B5 = B[bi+5];
double B6 = B[bi+6];
double B7 = B[bi+7];
bi += 8;
vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
ai += 4;
vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A0, B1, gvl);
vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B2, gvl);
vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A0, B3, gvl);
vfloat64m1_t result4 = __riscv_vfmul_vf_f64m1( A0, B4, gvl);
vfloat64m1_t result5 = __riscv_vfmul_vf_f64m1( A0, B5, gvl);
vfloat64m1_t result6 = __riscv_vfmul_vf_f64m1( A0, B6, gvl);
vfloat64m1_t result7 = __riscv_vfmul_vf_f64m1( A0, B7, gvl);
for(BLASLONG k=1; k<K; k++) {
B0 = B[bi+0];
B1 = B[bi+1];
B2 = B[bi+2];
B3 = B[bi+3];
B4 = B[bi+4];
B5 = B[bi+5];
B6 = B[bi+6];
B7 = B[bi+7];
bi += 8;
A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
ai += 4;
result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
result1 = __riscv_vfmacc_vf_f64m1( result1, B1, A0, gvl);
result2 = __riscv_vfmacc_vf_f64m1( result2, B2, A0, gvl);
result3 = __riscv_vfmacc_vf_f64m1( result3, B3, A0, gvl);
result4 = __riscv_vfmacc_vf_f64m1( result4, B4, A0, gvl);
result5 = __riscv_vfmacc_vf_f64m1( result5, B5, A0, gvl);
result6 = __riscv_vfmacc_vf_f64m1( result6, B6, A0, gvl);
result7 = __riscv_vfmacc_vf_f64m1( result7, B7, A0, gvl);
}
BLASLONG ci=n_top*ldc+m_top;
vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
vfloat64m1_t c2 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
vfloat64m1_t c3 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
vfloat64m1_t c4 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
vfloat64m1_t c5 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
vfloat64m1_t c6 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
vfloat64m1_t c7 = __riscv_vle64_v_f64m1( &C[ci], gvl);
c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl );
c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl );
c2 = __riscv_vfmacc_vf_f64m1( c2, alpha, result2, gvl );
c3 = __riscv_vfmacc_vf_f64m1( c3, alpha, result3, gvl );
c4 = __riscv_vfmacc_vf_f64m1( c4, alpha, result4, gvl );
c5 = __riscv_vfmacc_vf_f64m1( c5, alpha, result5, gvl );
c6 = __riscv_vfmacc_vf_f64m1( c6, alpha, result6, gvl );
c7 = __riscv_vfmacc_vf_f64m1( c7, alpha, result7, gvl );
ci=n_top*ldc+m_top;
__riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
__riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*0;
__riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += ldc-gvl*0;
__riscv_vse64_v_f64m1( &C[ci], c3, gvl); ci += ldc-gvl*0;
__riscv_vse64_v_f64m1( &C[ci], c4, gvl); ci += ldc-gvl*0;
__riscv_vse64_v_f64m1( &C[ci], c5, gvl); ci += ldc-gvl*0;
__riscv_vse64_v_f64m1( &C[ci], c6, gvl); ci += ldc-gvl*0;
__riscv_vse64_v_f64m1( &C[ci], c7, gvl);
m_top += 4;
}
if( M & 2 ) {
double result0 = 0;
double result1 = 0;
double result2 = 0;
double result3 = 0;
double result4 = 0;
double result5 = 0;
double result6 = 0;
double result7 = 0;
double result8 = 0;
double result9 = 0;
double result10 = 0;
double result11 = 0;
double result12 = 0;
double result13 = 0;
double result14 = 0;
double result15 = 0;
BLASLONG ai=m_top*K;
BLASLONG bi=n_top*K;
for(BLASLONG k=0; k<K; k++) {
result0+=A[ai+0]*B[bi+0];
result1+=A[ai+1]*B[bi+0];
result2+=A[ai+0]*B[bi+1];
result3+=A[ai+1]*B[bi+1];
result4+=A[ai+0]*B[bi+2];
result5+=A[ai+1]*B[bi+2];
result6+=A[ai+0]*B[bi+3];
result7+=A[ai+1]*B[bi+3];
result8+=A[ai+0]*B[bi+4];
result9+=A[ai+1]*B[bi+4];
result10+=A[ai+0]*B[bi+5];
result11+=A[ai+1]*B[bi+5];
result12+=A[ai+0]*B[bi+6];
result13+=A[ai+1]*B[bi+6];
result14+=A[ai+0]*B[bi+7];
result15+=A[ai+1]*B[bi+7];
ai+=2;
bi+=8;
}
BLASLONG ci=n_top*ldc+m_top;
C[ci+0*ldc+0] += alpha * result0;
C[ci+0*ldc+1] += alpha * result1;
C[ci+1*ldc+0] += alpha * result2;
C[ci+1*ldc+1] += alpha * result3;
C[ci+2*ldc+0] += alpha * result4;
C[ci+2*ldc+1] += alpha * result5;
C[ci+3*ldc+0] += alpha * result6;
C[ci+3*ldc+1] += alpha * result7;
C[ci+4*ldc+0] += alpha * result8;
C[ci+4*ldc+1] += alpha * result9;
C[ci+5*ldc+0] += alpha * result10;
C[ci+5*ldc+1] += alpha * result11;
C[ci+6*ldc+0] += alpha * result12;
C[ci+6*ldc+1] += alpha * result13;
C[ci+7*ldc+0] += alpha * result14;
C[ci+7*ldc+1] += alpha * result15;
m_top+=2;
}
if( M & 1 ) {
double result0 = 0;
double result1 = 0;
double result2 = 0;
double result3 = 0;
double result4 = 0;
double result5 = 0;
double result6 = 0;
double result7 = 0;
BLASLONG ai=m_top*K;
BLASLONG bi=n_top*K;
for(BLASLONG k=0; k<K; k++) {
result0+=A[ai+0]*B[bi+0];
result1+=A[ai+0]*B[bi+1];
result2+=A[ai+0]*B[bi+2];
result3+=A[ai+0]*B[bi+3];
result4+=A[ai+0]*B[bi+4];
result5+=A[ai+0]*B[bi+5];
result6+=A[ai+0]*B[bi+6];
result7+=A[ai+0]*B[bi+7];
ai+=1;
bi+=8;
}
BLASLONG ci=n_top*ldc+m_top;
C[ci+0*ldc+0] += alpha * result0;
C[ci+1*ldc+0] += alpha * result1;
C[ci+2*ldc+0] += alpha * result2;
C[ci+3*ldc+0] += alpha * result3;
C[ci+4*ldc+0] += alpha * result4;
C[ci+5*ldc+0] += alpha * result5;
C[ci+6*ldc+0] += alpha * result6;
C[ci+7*ldc+0] += alpha * result7;
m_top+=1;
}
n_top += 8;
}
// -- tails for N=4
if( N & 4 ) {
gvl = __riscv_vsetvl_e64m1(4);
m_top = 0;
for (BLASLONG i=0; i<M/8; i+=1) {
BLASLONG ai=m_top*K;
BLASLONG bi=n_top*K;
double B0 = B[bi+0];
double B1 = B[bi+1];
double B2 = B[bi+2];
double B3 = B[bi+3];
bi += 4;
vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
vfloat64m1_t A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
ai += 8;
vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A1, B0, gvl);
vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B1, gvl);
vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A1, B1, gvl);
vfloat64m1_t result4 = __riscv_vfmul_vf_f64m1( A0, B2, gvl);
vfloat64m1_t result5 = __riscv_vfmul_vf_f64m1( A1, B2, gvl);
vfloat64m1_t result6 = __riscv_vfmul_vf_f64m1( A0, B3, gvl);
vfloat64m1_t result7 = __riscv_vfmul_vf_f64m1( A1, B3, gvl);
for(BLASLONG k=1; k<K; k++) {
B0 = B[bi+0];
B1 = B[bi+1];
B2 = B[bi+2];
B3 = B[bi+3];
bi += 4;
A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
ai += 8;
result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
result1 = __riscv_vfmacc_vf_f64m1( result1, B0, A1, gvl);
result2 = __riscv_vfmacc_vf_f64m1( result2, B1, A0, gvl);
result3 = __riscv_vfmacc_vf_f64m1( result3, B1, A1, gvl);
result4 = __riscv_vfmacc_vf_f64m1( result4, B2, A0, gvl);
result5 = __riscv_vfmacc_vf_f64m1( result5, B2, A1, gvl);
result6 = __riscv_vfmacc_vf_f64m1( result6, B3, A0, gvl);
result7 = __riscv_vfmacc_vf_f64m1( result7, B3, A1, gvl);
}
BLASLONG ci=n_top*ldc+m_top;
vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
vfloat64m1_t c2 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
vfloat64m1_t c3 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
vfloat64m1_t c4 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
vfloat64m1_t c5 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
vfloat64m1_t c6 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
vfloat64m1_t c7 = __riscv_vle64_v_f64m1( &C[ci], gvl);
c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl );
c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl );
c2 = __riscv_vfmacc_vf_f64m1( c2, alpha, result2, gvl );
c3 = __riscv_vfmacc_vf_f64m1( c3, alpha, result3, gvl );
c4 = __riscv_vfmacc_vf_f64m1( c4, alpha, result4, gvl );
c5 = __riscv_vfmacc_vf_f64m1( c5, alpha, result5, gvl );
c6 = __riscv_vfmacc_vf_f64m1( c6, alpha, result6, gvl );
c7 = __riscv_vfmacc_vf_f64m1( c7, alpha, result7, gvl );
ci=n_top*ldc+m_top;
__riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl;
__riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*1;
__riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += gvl;
__riscv_vse64_v_f64m1( &C[ci], c3, gvl); ci += ldc-gvl*1;
__riscv_vse64_v_f64m1( &C[ci], c4, gvl); ci += gvl;
__riscv_vse64_v_f64m1( &C[ci], c5, gvl); ci += ldc-gvl*1;
__riscv_vse64_v_f64m1( &C[ci], c6, gvl); ci += gvl;
__riscv_vse64_v_f64m1( &C[ci], c7, gvl);
m_top += 8;
}
if( M & 4 ) {
gvl = __riscv_vsetvl_e64m1(4);
BLASLONG ai=m_top*K;
BLASLONG bi=n_top*K;
double B0 = B[bi+0];
double B1 = B[bi+1];
double B2 = B[bi+2];
double B3 = B[bi+3];
bi += 4;
vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
ai += 4;
vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A0, B1, gvl);
vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B2, gvl);
vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A0, B3, gvl);
for(BLASLONG k=1; k<K; k++) {
B0 = B[bi+0];
B1 = B[bi+1];
B2 = B[bi+2];
B3 = B[bi+3];
bi += 4;
A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
ai += 4;
result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
result1 = __riscv_vfmacc_vf_f64m1( result1, B1, A0, gvl);
result2 = __riscv_vfmacc_vf_f64m1( result2, B2, A0, gvl);
result3 = __riscv_vfmacc_vf_f64m1( result3, B3, A0, gvl);
}
BLASLONG ci=n_top*ldc+m_top;
vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
vfloat64m1_t c2 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
vfloat64m1_t c3 = __riscv_vle64_v_f64m1( &C[ci], gvl);
c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl );
c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl );
c2 = __riscv_vfmacc_vf_f64m1( c2, alpha, result2, gvl );
c3 = __riscv_vfmacc_vf_f64m1( c3, alpha, result3, gvl );
ci=n_top*ldc+m_top;
__riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
__riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*0;
__riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += ldc-gvl*0;
__riscv_vse64_v_f64m1( &C[ci], c3, gvl);
m_top += 4;
}
if( M & 2 ) {
double result0 = 0;
double result1 = 0;
double result2 = 0;
double result3 = 0;
double result4 = 0;
double result5 = 0;
double result6 = 0;
double result7 = 0;
BLASLONG ai=m_top*K;
BLASLONG bi=n_top*K;
for(BLASLONG k=0; k<K; k++) {
result0+=A[ai+0]*B[bi+0];
result1+=A[ai+1]*B[bi+0];
result2+=A[ai+0]*B[bi+1];
result3+=A[ai+1]*B[bi+1];
result4+=A[ai+0]*B[bi+2];
result5+=A[ai+1]*B[bi+2];
result6+=A[ai+0]*B[bi+3];
result7+=A[ai+1]*B[bi+3];
ai+=2;
bi+=4;
}
BLASLONG ci=n_top*ldc+m_top;
C[ci+0*ldc+0] += alpha * result0;
C[ci+0*ldc+1] += alpha * result1;
C[ci+1*ldc+0] += alpha * result2;
C[ci+1*ldc+1] += alpha * result3;
C[ci+2*ldc+0] += alpha * result4;
C[ci+2*ldc+1] += alpha * result5;
C[ci+3*ldc+0] += alpha * result6;
C[ci+3*ldc+1] += alpha * result7;
m_top+=2;
}
if( M & 1 ) {
double result0 = 0;
double result1 = 0;
double result2 = 0;
double result3 = 0;
BLASLONG ai=m_top*K;
BLASLONG bi=n_top*K;
for(BLASLONG k=0; k<K; k++) {
result0+=A[ai+0]*B[bi+0];
result1+=A[ai+0]*B[bi+1];
result2+=A[ai+0]*B[bi+2];
result3+=A[ai+0]*B[bi+3];
ai+=1;
bi+=4;
}
BLASLONG ci=n_top*ldc+m_top;
C[ci+0*ldc+0] += alpha * result0;
C[ci+1*ldc+0] += alpha * result1;
C[ci+2*ldc+0] += alpha * result2;
C[ci+3*ldc+0] += alpha * result3;
m_top+=1;
}
n_top += 4;
}
// -- tails for N=2
if( N & 2 ) {
gvl = __riscv_vsetvl_e64m1(4);
m_top = 0;
for (BLASLONG i=0; i<M/8; i+=1) {
BLASLONG ai=m_top*K;
BLASLONG bi=n_top*K;
double B0 = B[bi+0];
double B1 = B[bi+1];
bi += 2;
vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
vfloat64m1_t A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
ai += 8;
vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A1, B0, gvl);
vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B1, gvl);
vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A1, B1, gvl);
for(BLASLONG k=1; k<K; k++) {
B0 = B[bi+0];
B1 = B[bi+1];
bi += 2;
A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
ai += 8;
result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
result1 = __riscv_vfmacc_vf_f64m1( result1, B0, A1, gvl);
result2 = __riscv_vfmacc_vf_f64m1( result2, B1, A0, gvl);
result3 = __riscv_vfmacc_vf_f64m1( result3, B1, A1, gvl);
}
BLASLONG ci=n_top*ldc+m_top;
vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
vfloat64m1_t c2 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
vfloat64m1_t c3 = __riscv_vle64_v_f64m1( &C[ci], gvl);
c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl );
c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl );
c2 = __riscv_vfmacc_vf_f64m1( c2, alpha, result2, gvl );
c3 = __riscv_vfmacc_vf_f64m1( c3, alpha, result3, gvl );
ci=n_top*ldc+m_top;
__riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl;
__riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*1;
__riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += gvl;
__riscv_vse64_v_f64m1( &C[ci], c3, gvl);
m_top += 8;
}
if( M & 4 ) {
gvl = __riscv_vsetvl_e64m1(4);
BLASLONG ai=m_top*K;
BLASLONG bi=n_top*K;
double B0 = B[bi+0];
double B1 = B[bi+1];
bi += 2;
vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
ai += 4;
vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A0, B1, gvl);
for(BLASLONG k=1; k<K; k++) {
B0 = B[bi+0];
B1 = B[bi+1];
bi += 2;
A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
ai += 4;
result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
result1 = __riscv_vfmacc_vf_f64m1( result1, B1, A0, gvl);
}
BLASLONG ci=n_top*ldc+m_top;
vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl);
c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl );
c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl );
ci=n_top*ldc+m_top;
__riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
__riscv_vse64_v_f64m1( &C[ci], c1, gvl);
m_top += 4;
}
if( M & 2 ) {
double result0 = 0;
double result1 = 0;
double result2 = 0;
double result3 = 0;
BLASLONG ai=m_top*K;
BLASLONG bi=n_top*K;
for(BLASLONG k=0; k<K; k++) {
result0+=A[ai+0]*B[bi+0];
result1+=A[ai+1]*B[bi+0];
result2+=A[ai+0]*B[bi+1];
result3+=A[ai+1]*B[bi+1];
ai+=2;
bi+=2;
}
BLASLONG ci=n_top*ldc+m_top;
C[ci+0*ldc+0] += alpha * result0;
C[ci+0*ldc+1] += alpha * result1;
C[ci+1*ldc+0] += alpha * result2;
C[ci+1*ldc+1] += alpha * result3;
m_top+=2;
}
if( M & 1 ) {
double result0 = 0;
double result1 = 0;
BLASLONG ai=m_top*K;
BLASLONG bi=n_top*K;
for(BLASLONG k=0; k<K; k++) {
result0+=A[ai+0]*B[bi+0];
result1+=A[ai+0]*B[bi+1];
ai+=1;
bi+=2;
}
BLASLONG ci=n_top*ldc+m_top;
C[ci+0*ldc+0] += alpha * result0;
C[ci+1*ldc+0] += alpha * result1;
m_top+=1;
}
n_top += 2;
}
// -- tails for N=1
if( N & 1 ) {
gvl = __riscv_vsetvl_e64m1(4);
m_top = 0;
for (BLASLONG i=0; i<M/8; i+=1) {
BLASLONG ai=m_top*K;
BLASLONG bi=n_top*K;
double B0 = B[bi+0];
bi += 1;
vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
vfloat64m1_t A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
ai += 8;
vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A1, B0, gvl);
for(BLASLONG k=1; k<K; k++) {
B0 = B[bi+0];
bi += 1;
A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
ai += 8;
result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
result1 = __riscv_vfmacc_vf_f64m1( result1, B0, A1, gvl);
}
BLASLONG ci=n_top*ldc+m_top;
vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl);
c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl );
c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl );
ci=n_top*ldc+m_top;
__riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl;
__riscv_vse64_v_f64m1( &C[ci], c1, gvl);
m_top += 8;
}
if( M & 4 ) {
gvl = __riscv_vsetvl_e64m1(4);
BLASLONG ai=m_top*K;
BLASLONG bi=n_top*K;
double B0 = B[bi+0];
bi += 1;
vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
ai += 4;
vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
for(BLASLONG k=1; k<K; k++) {
B0 = B[bi+0];
bi += 1;
A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
ai += 4;
result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
}
BLASLONG ci=n_top*ldc+m_top;
vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl);
c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl );
ci=n_top*ldc+m_top;
__riscv_vse64_v_f64m1( &C[ci], c0, gvl);
m_top += 4;
}
if( M & 2 ) {
double result0 = 0;
double result1 = 0;
BLASLONG ai=m_top*K;
BLASLONG bi=n_top*K;
for(BLASLONG k=0; k<K; k++) {
result0+=A[ai+0]*B[bi+0];
result1+=A[ai+1]*B[bi+0];
ai+=2;
bi+=1;
}
BLASLONG ci=n_top*ldc+m_top;
C[ci+0*ldc+0] += alpha * result0;
C[ci+0*ldc+1] += alpha * result1;
m_top+=2;
}
if( M & 1 ) {
double result0 = 0;
BLASLONG ai=m_top*K;
BLASLONG bi=n_top*K;
for(BLASLONG k=0; k<K; k++) {
result0+=A[ai+0]*B[bi+0];
ai+=1;
bi+=1;
}
BLASLONG ci=n_top*ldc+m_top;
C[ci+0*ldc+0] += alpha * result0;
m_top+=1;
}
n_top += 1;
}
return 0;
}
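For reference, the generated kernel above computes C += alpha * A * B one 8x8 tile at a time from packed panels: on the zvl256b target a 256-bit m1 register holds 4 doubles, so each tile row of A is loaded as two vectors of 4. A plain scalar equivalent of the main pass, under the same packing assumptions (the function name is made up; this is a sketch, not the generator's output):

/* A is packed in column panels of 8 (a[k*8 + m]), B in row panels of 8
 * (b[k*8 + n]), and C is column-major with leading dimension ldc. */
static void ref_tile_8x8(long K, double alpha,
                         const double *a, const double *b,
                         double *c, long ldc)
{
    for (long n = 0; n < 8; n++)
        for (long m = 0; m < 8; m++) {
            double acc = 0.0;
            for (long k = 0; k < K; k++)
                acc += a[k * 8 + m] * b[k * 8 + n];
            c[n * ldc + m] += alpha * acc;
        }
}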

View File

@@ -46,7 +46,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
BLASLONG ix=0,iy=0;
double dot = 0.0 ;
if ( n < 0 ) return(dot);
if ( n < 1 ) return(dot);
while(i < n)
{

View File

@@ -27,31 +27,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define VSETVL(n) __riscv_vsetvl_e32m4(n)
#define VSETVL_MAX __riscv_vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m4_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
#define VLEV_FLOAT vle32_v_f32m4
#define VLSEV_FLOAT vlse32_v_f32m4
#define VFREDSUM_FLOAT vfredosum_vs_f32m4_f32m1
#define VFMACCVV_FLOAT vfmacc_vv_f32m4
#define VFMVVF_FLOAT vfmv_v_f_f32m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFDOTVV_FLOAT vfdot_vv_f32m4
#define VLEV_FLOAT __riscv_vle32_v_f32m4
#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1
#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
#define VFDOTVV_FLOAT __riscv_vfdot_vv_f32m4
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define VSETVL(n) __riscv_vsetvl_e64m4(n)
#define VSETVL_MAX __riscv_vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m4_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
#define VLEV_FLOAT vle64_v_f64m4
#define VLSEV_FLOAT vlse64_v_f64m4
#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
#define VFMACCVV_FLOAT vfmacc_vv_f64m4
#define VFMVVF_FLOAT vfmv_v_f_f64m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFDOTVV_FLOAT vfdot_vv_f64m4
#define VLEV_FLOAT __riscv_vle64_v_f64m4
#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1
#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
#define VFDOTVV_FLOAT __riscv_vfdot_vv_f64m4
#endif
#if defined(DSDOT)
@ -63,7 +61,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
BLASLONG i=0, j=0;
double dot = 0.0 ;
if ( n < 0 ) return(dot);
if ( n < 1 ) return(dot);
FLOAT_V_T vr, vx, vy;
unsigned int gvl = 0;
@ -82,8 +80,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
j += gvl;
}
if(j > 0){
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
dot += (double)VFMVFS_FLOAT(v_res);
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
dot += (double)EXTRACT_FLOAT(v_res);
}
//tail
if(j < n){
@ -93,13 +91,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl);
//vr = VFDOTVV_FLOAT(vx, vy, gvl);
vr = VFMACCVV_FLOAT(vz, vx, vy, gvl);
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
dot += (double)VFMVFS_FLOAT(v_res);
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
dot += (double)EXTRACT_FLOAT(v_res);
}
}else if(inc_y == 1){
gvl = VSETVL(n);
vr = VFMVVF_FLOAT(0, gvl);
int stride_x = inc_x * sizeof(FLOAT);
BLASLONG stride_x = inc_x * sizeof(FLOAT);
for(i=0,j=0; i<n/gvl; i++){
vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
vy = VLEV_FLOAT(&y[j], gvl);
@ -107,9 +105,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
j += gvl;
}
if(j > 0){
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
dot += (double)VFMVFS_FLOAT(v_res);
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
dot += (double)EXTRACT_FLOAT(v_res);
}
//tail
if(j < n){
@ -119,14 +116,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl);
//vr = VFDOTVV_FLOAT(vx, vy, gvl);
vr = VFMACCVV_FLOAT(vz, vx, vy, gvl);
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
dot += (double)VFMVFS_FLOAT(v_res);
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
dot += (double)EXTRACT_FLOAT(v_res);
}
}else if(inc_x == 1){
gvl = VSETVL(n);
vr = VFMVVF_FLOAT(0, gvl);
int stride_y = inc_y * sizeof(FLOAT);
BLASLONG stride_y = inc_y * sizeof(FLOAT);
for(i=0,j=0; i<n/gvl; i++){
vx = VLEV_FLOAT(&x[j], gvl);
vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl);
@ -134,9 +130,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
j += gvl;
}
if(j > 0){
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
dot += (double)VFMVFS_FLOAT(v_res);
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
dot += (double)EXTRACT_FLOAT(v_res);
}
//tail
if(j < n){
@ -146,15 +141,14 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl);
//vr = VFDOTVV_FLOAT(vx, vy, gvl);
vr = VFMACCVV_FLOAT(vz, vx, vy, gvl);
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
dot += (double)VFMVFS_FLOAT(v_res);
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
dot += (double)EXTRACT_FLOAT(v_res);
}
}else{
gvl = VSETVL(n);
vr = VFMVVF_FLOAT(0, gvl);
int stride_x = inc_x * sizeof(FLOAT);
int stride_y = inc_y * sizeof(FLOAT);
BLASLONG stride_x = inc_x * sizeof(FLOAT);
BLASLONG stride_y = inc_y * sizeof(FLOAT);
for(i=0,j=0; i<n/gvl; i++){
vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl);
@ -162,9 +156,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
j += gvl;
}
if(j > 0){
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
dot += (double)VFMVFS_FLOAT(v_res);
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
dot += (double)EXTRACT_FLOAT(v_res);
}
//tail
if(j < n){
@ -174,9 +167,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl);
//vr = VFDOTVV_FLOAT(vx, vy, gvl);
vr = VFMACCVV_FLOAT(vz, vx, vy, gvl);
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
dot += (double)VFMVFS_FLOAT(v_res);
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
dot += (double)EXTRACT_FLOAT(v_res);
}
}
return(dot);
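
The reduction idiom in this file follows the ratified intrinsic signatures: the separate destination operand is dropped, so VFREDSUM_FLOAT is now called as (vector, scalar seed, vl) rather than (dest, vector, scalar seed, vl), and the scalar result is read back with EXTRACT_FLOAT instead of the per-type VFMVFS_FLOAT macro. The single-precision path also switches from the ordered vfredosum to the unordered vfredusum, matching the double-precision path.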

File diff suppressed because it is too large Load Diff

View File

@ -27,21 +27,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define VSETVL(n) __riscv_vsetvl_e32m4(n)
#define FLOAT_V_T vfloat32m4_t
#define VLEV_FLOAT vle32_v_f32m4
#define VLSEV_FLOAT vlse32_v_f32m4
#define VSEV_FLOAT vse32_v_f32m4
#define VSSEV_FLOAT vsse32_v_f32m4
#define VFMACCVF_FLOAT vfmacc_vf_f32m4
#define VLEV_FLOAT __riscv_vle32_v_f32m4
#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
#define VSEV_FLOAT __riscv_vse32_v_f32m4
#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define VSETVL(n) __riscv_vsetvl_e64m4(n)
#define FLOAT_V_T vfloat64m4_t
#define VLEV_FLOAT vle64_v_f64m4
#define VLSEV_FLOAT vlse64_v_f64m4
#define VSEV_FLOAT vse64_v_f64m4
#define VSSEV_FLOAT vsse64_v_f64m4
#define VFMACCVF_FLOAT vfmacc_vf_f64m4
#define VLEV_FLOAT __riscv_vle64_v_f64m4
#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
#define VSEV_FLOAT __riscv_vse64_v_f64m4
#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
#endif
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)

View File

@ -27,107 +27,102 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m4_t
#define VSETVL(n) __riscv_vsetvl_e32m2(n)
#define FLOAT_V_T vfloat32m2_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
#define VLEV_FLOAT vle32_v_f32m4
#define VLSEV_FLOAT vlse32_v_f32m4
#define VFREDSUM_FLOAT vfredosum_vs_f32m4_f32m1
#define VFMACCVV_FLOAT vfmacc_vv_f32m4
#define VFMVVF_FLOAT vfmv_v_f_f32m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFDOTVV_FLOAT vfdot_vv_f32m4
#define VFMULVV_FLOAT vfmul_vv_f32m4
#define VLEV_FLOAT __riscv_vle32_v_f32m2
#define VLSEV_FLOAT __riscv_vlse32_v_f32m2
#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m2_f32m1
#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m2
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m2
#define xint_t int
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m4_t
#define VSETVL(n) __riscv_vsetvl_e64m2(n)
#define FLOAT_V_T vfloat64m2_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
#define VLEV_FLOAT vle64_v_f64m4
#define VLSEV_FLOAT vlse64_v_f64m4
#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
#define VFMACCVV_FLOAT vfmacc_vv_f64m4
#define VFMVVF_FLOAT vfmv_v_f_f64m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFDOTVV_FLOAT vfdot_vv_f64m4
#define VFMULVV_FLOAT vfmul_vv_f64m4
#define VLEV_FLOAT __riscv_vle64_v_f64m2
#define VLSEV_FLOAT __riscv_vlse64_v_f64m2
#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m2_f64m1
#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m2
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m2
#define xint_t long long
#endif
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG i = 0, j = 0, k = 0;
BLASLONG ix = 0, iy = 0;
FLOAT *a_ptr = a;
BLASLONG i = 0, j = 0, k = 0;
BLASLONG ix = 0, iy = 0;
FLOAT *a_ptr = a;
FLOAT temp;
FLOAT_V_T va, vr, vx;
unsigned int gvl = 0;
FLOAT_V_T_M1 v_res, v_z0;
gvl = VSETVL_MAX;
v_res = VFMVVF_FLOAT_M1(0, gvl);
v_z0 = VFMVVF_FLOAT_M1(0, gvl);
BLASLONG gvl = 0;
FLOAT_V_T_M1 v_res;
if(inc_x == 1){
for(i = 0; i < n; i++){
v_res = VFMVVF_FLOAT_M1(0, 1);
gvl = VSETVL(m);
j = 0;
vr = VFMVVF_FLOAT(0, gvl);
for(k = 0; k < m/gvl; k++){
va = VLEV_FLOAT(&a_ptr[j], gvl);
vx = VLEV_FLOAT(&x[j], gvl);
vr = VFMACCVV_FLOAT(vr, va, vx, gvl);
vr = VFMULVV_FLOAT(va, vx, gvl); // could vfmacc here and reduce outside loop
v_res = VFREDSUM_FLOAT(vr, v_res, gvl); // but that reordering diverges far enough from scalar path to make tests fail
j += gvl;
}
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
temp = (FLOAT)VFMVFS_FLOAT(v_res);
if(j < m){
gvl = VSETVL(m-j);
va = VLEV_FLOAT(&a_ptr[j], gvl);
vx = VLEV_FLOAT(&x[j], gvl);
vr = VFMULVV_FLOAT(va, vx, gvl);
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
temp += (FLOAT)VFMVFS_FLOAT(v_res);
v_res = VFREDSUM_FLOAT(vr, v_res, gvl);
}
temp = (FLOAT)EXTRACT_FLOAT(v_res);
y[iy] += alpha * temp;
iy += inc_y;
a_ptr += lda;
}
}else{
BLASLONG stride_x = inc_x * sizeof(FLOAT);
for(i = 0; i < n; i++){
v_res = VFMVVF_FLOAT_M1(0, 1);
gvl = VSETVL(m);
BLASLONG inc_xv = inc_x * gvl;
j = 0;
ix = 0;
vr = VFMVVF_FLOAT(0, gvl);
for(k = 0; k < m/gvl; k++){
va = VLEV_FLOAT(&a_ptr[j], gvl);
vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
vr = VFMACCVV_FLOAT(vr, va, vx, gvl);
vr = VFMULVV_FLOAT(va, vx, gvl);
v_res = VFREDSUM_FLOAT(vr, v_res, gvl);
j += gvl;
ix += inc_xv;
ix += inc_x * gvl;
}
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
temp = (FLOAT)VFMVFS_FLOAT(v_res);
if(j < m){
gvl = VSETVL(m-j);
va = VLEV_FLOAT(&a_ptr[j], gvl);
vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
vr = VFMULVV_FLOAT(va, vx, gvl);
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
temp += (FLOAT)VFMVFS_FLOAT(v_res);
v_res = VFREDSUM_FLOAT(vr, v_res, gvl);
}
temp = (FLOAT)EXTRACT_FLOAT(v_res);
y[iy] += alpha * temp;
iy += inc_y;
a_ptr += lda;
}
}
return(0);
}
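
The comment inside the inc_x == 1 loop above explains why each block is reduced immediately rather than accumulated. For comparison, the rejected ordering — accumulate with VFMACCVV and reduce once per column — would look like the sketch below. This is an illustration only, with a hypothetical helper name, assuming nothing beyond riscv_vector.h; it reassociates the sum relative to the scalar path:

static float column_dot(const float *a_col, const float *x, size_t m)
{
    size_t gvl = __riscv_vsetvl_e32m2(m);
    vfloat32m2_t vr = __riscv_vfmv_v_f_f32m2(0, gvl);
    size_t j = 0;
    for (size_t k = 0; k < m / gvl; k++, j += gvl) {
        vfloat32m2_t va = __riscv_vle32_v_f32m2(&a_col[j], gvl);
        vfloat32m2_t vx = __riscv_vle32_v_f32m2(&x[j], gvl);
        vr = __riscv_vfmacc_vv_f32m2(vr, va, vx, gvl); /* accumulate, no reduction yet */
    }
    /* tail elements (j < m) omitted for brevity */
    vfloat32m1_t seed = __riscv_vfmv_v_f_f32m1(0, 1);
    vfloat32m1_t v_res = __riscv_vfredusum_vs_f32m2_f32m1(vr, seed, gvl);
    return __riscv_vfmv_f_s_f32m1_f32(v_res); /* one reduction per column */
}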

kernel/riscv64/generate_kernel.py (new executable file, 670 lines)
View File

@ -0,0 +1,670 @@
#!/usr/bin/python3
import sys, os
import contextlib
#-----------------------------------------------------------------------
def ERROR(*args, **kwargs):
print(*args, file=sys.stderr, **kwargs)
sys.exit(-1)
class Target(object):
def __init__( self, out, mappings, initial_level=0, tab_width=4 ):
self._level = initial_level
self._tab_width = tab_width
self._out = out
self._mappings = mappings
@contextlib.contextmanager
def map( self, **items ):
old_mappings = self._mappings
self._mappings = dict(old_mappings, **items)
yield self._mappings
self._mappings = old_mappings
@contextlib.contextmanager
def block( self, start=None, end=None, **args ):
with self.map(**args):
if start is not None:
self.write();
self.write(start)
self._level += 1
yield self._level
self._level -= 1
if end is not None:
self.write(end)
self.write()
def write( self, fmt=None, *args, **kwargs ):
if fmt is not None:
mappings = dict(self._mappings, **kwargs) if kwargs else self._mappings
self._out(self._indent_str() + fmt.format(*args, **mappings))
else:
self._out("")
def _indent_str( self ):
return ' ' * (self._level * self._tab_width)
#-----------------------------------------------------------------------
def generate_trmm_block( dest ):
dest.write("{index_type} pass_K = K;")
dest.write("#ifdef LEFT")
with dest.block():
dest.write("{index_type} off = offset + m_top;")
dest.write("#else")
with dest.block():
dest.write("{index_type} off = -offset + n_top;")
dest.write("#endif")
dest.write("#ifdef BACKWARDS")
with dest.block():
dest.write("ai += off*{M}{elt_size};")
dest.write("bi += off*{N}{elt_size};")
dest.write("pass_K -= off;")
dest.write("#else")
with dest.block():
dest.write("#ifdef LEFT")
with dest.block():
dest.write("pass_K = off + {M};")
dest.write("#else")
with dest.block():
dest.write("pass_K = off + {N};")
dest.write("#endif")
dest.write("#endif")
#-----------------------------------------------------------------------
def generate_gemm_kernel_inner_real( settings, dest, M, N, vlen, a_regs ):
TRMM = (settings['op'].value == 'trmm')
narrow_result = (settings['param_precision'].value != 'double') and settings['force_acc_double'].value
with dest.map(
M=M,
N=N,
):
dest.write("{index_type} ai=m_top*K{elt_size};")
dest.write("{index_type} bi=n_top*K{elt_size};")
if TRMM:
generate_trmm_block( dest )
for i in range(N):
dest.write("{param_scalar_t} B{i} = B[bi+{i}];", i=i)
dest.write("bi += {N};")
dest.write()
for i in range(a_regs):
dest.write("{param_vector_t} A{i} = {VLEV}( &A[ai+{i}*gvl], gvl );", i=i)
dest.write("ai += {M};")
dest.write()
for j in range(N):
for i in range(a_regs):
dest.write("{acc_vector_t} result{dest} = {VMUL_TO_ACC}( A{i}, B{j}, gvl);", dest=j*a_regs+i, i=i, j=j)
with dest.block("for({index_type} k=1; k<{Kend}; k++) {{", "}}", Kend=('pass_K' if TRMM else 'K')):
for i in range(N):
dest.write("B{i} = B[bi+{i}];", i=i )
dest.write("bi += {N};")
dest.write()
for i in range(a_regs):
dest.write("A{i} = {VLEV}( &A[ai+{i}*gvl], gvl );", i=i)
dest.write("ai += {M};")
dest.write()
for j in range(N):
for i in range(a_regs):
dest.write("result{dest} = {VMACC_TO_ACC}( result{dest}, B{j}, A{i}, gvl);", dest= j*a_regs+i, j=j, i=i )
dest.write()
dest.write("{index_type} ci=n_top*ldc+m_top;")
dest.write()
if narrow_result:
for j in range(N):
for i in range(a_regs):
dest.write("{param_vector_t} narrowed{idx} = {VFNCVT}( result{idx}, gvl );", idx=j*a_regs+i)
if not TRMM:
for j in range(N):
for i in range(a_regs):
idx = j*a_regs+i
increment = ' ci += ldc-gvl*{};'.format(a_regs-1) if (i == a_regs-1) else ' ci += gvl;'
if idx == N*a_regs-1:
increment = ''
dest.write("{param_vector_t} c{idx} = {VLEV}( &C[ci], gvl);{increment}", idx=idx, increment=increment)
if narrow_result:
for j in range(N):
for i in range(a_regs):
idx = j*a_regs+i
if TRMM:
dest.write("{param_vector_t} c{idx} = {VFMUL}( narrowed{idx}, alpha, gvl );", idx=idx)
else:
dest.write("c{idx} = {VFMACC}( c{idx}, alpha, narrowed{idx}, gvl );", idx=idx)
else:
for j in range(N):
for i in range(a_regs):
idx = j*a_regs+i
if TRMM:
dest.write("{param_vector_t} c{idx} = {VFMUL}( result{idx}, alpha, gvl );", idx=idx)
else:
dest.write("c{idx} = {VFMACC}( c{idx}, alpha, result{idx}, gvl );", idx=idx)
if not TRMM:
dest.write()
dest.write("ci=n_top*ldc+m_top;")
dest.write()
for j in range(N):
for i in range(a_regs):
idx = j*a_regs+i
increment = ' ci += ldc-gvl*{};'.format(a_regs-1) if (i == a_regs-1) else ' ci += gvl;'
if idx == N*a_regs-1:
increment = ''
dest.write("{VSEV}( &C[ci], c{idx}, gvl);{increment}", idx=idx, increment=increment)
#-----------------------------------------------------------------------
def generate_gemm_kernel_inner_complex( settings, dest, M, N, vlen, a_regs ):
TRMM = (settings['op'].value == 'trmm')
narrow_result = (settings['param_precision'].value != 'double') and settings['force_acc_double'].value
if narrow_result:
raise RuntimeError("wide accumulator not supported for generated complex kernels")
# we could, but we run out of registers really really fast
with dest.map(
M=M,
N=N,
):
dest.write("{index_type} ai=m_top*K*2;")
dest.write("{index_type} bi=n_top*K*2;")
if TRMM:
generate_trmm_block( dest )
for i in range(N):
dest.write("{param_scalar_t} B{i}r = B[bi+{i}*2+0];", i=i)
dest.write("{param_scalar_t} B{i}i = B[bi+{i}*2+1];", i=i)
dest.write("bi += {N}*2;")
dest.write()
for i in range(a_regs):
dest.write("{param_vector_t} A{i}r = {VLSEV}( &A[ai+{i}*gvl*2], sizeof(FLOAT)*2, gvl );", i=i)
dest.write("{param_vector_t} A{i}i = {VLSEV}( &A[ai+{i}*gvl*2+1], sizeof(FLOAT)*2, gvl );", i=i)
dest.write("ai += {M}*2;")
dest.write()
accumulation_regs = a_regs * N * settings['LMUL_ACC'].value
dest.write("// {a_regs} vector regs to hold A array contents, {accumulation_regs} regs to hold values accumulated over k",
a_regs=a_regs*2, accumulation_regs=accumulation_regs*2
)
pass_regs = (accumulation_regs + a_regs)*2
tmp_regs = 32-pass_regs
if tmp_regs < 2:
raise RuntimeError("Complex kernel would use too many registers!")
dest.write("// leaving {tmp_regs} vector registers for temporaries", tmp_regs=tmp_regs)
tmp_unroll_i = min(tmp_regs, a_regs)
tmp_unroll_j = N
while tmp_unroll_j > 1 and (tmp_regs/(tmp_unroll_i*2)) < tmp_unroll_j:
tmp_unroll_j = int(tmp_unroll_j / 2)
if tmp_unroll_i < a_regs or tmp_unroll_j < N:
dest.write("// performing {ops} operations between reuses of temporaries", ops=tmp_unroll_j*tmp_unroll_i)
for tj in range(0, N, tmp_unroll_j):
for ti in range(0, a_regs, tmp_unroll_i):
for j in range(tj, tj+tmp_unroll_j):
for i in range(ti, ti+tmp_unroll_i):
with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j):
if ti == 0 and tj==0:
dest.write("{acc_vector_t} tmp{tmp}r = {VMUL_TO_ACC}( A{i}i, B{j}i, gvl);")
dest.write("{acc_vector_t} tmp{tmp}i = {VMUL_TO_ACC}( A{i}r, B{j}i, gvl);")
else:
dest.write("tmp{tmp}r = {VMUL_TO_ACC}( A{i}i, B{j}i, gvl);")
dest.write("tmp{tmp}i = {VMUL_TO_ACC}( A{i}r, B{j}i, gvl);")
for j in range(tj, tj+tmp_unroll_j):
for i in range(ti, ti+tmp_unroll_i):
with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j):
dest.write("tmp{tmp}r = VFMACC_RR( tmp{tmp}r, B{j}r, A{i}r, gvl);")
dest.write("tmp{tmp}i = VFMACC_RI( tmp{tmp}i, B{j}r, A{i}i, gvl);")
for j in range(tj, tj+tmp_unroll_j):
for i in range(ti, ti+tmp_unroll_i):
with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j):
dest.write("{acc_vector_t} ACC{dest}r = tmp{tmp}r;")
dest.write("{acc_vector_t} ACC{dest}i = tmp{tmp}i;")
with dest.block("for({index_type} k=1; k<{Kend}; k++) {{", "}}", Kend=('pass_K' if TRMM else 'K')):
for i in range(N):
dest.write("B{i}r = B[bi+{i}*2+0];", i=i)
dest.write("B{i}i = B[bi+{i}*2+1];", i=i)
dest.write("bi += {N}*2;")
dest.write()
for i in range(a_regs):
dest.write("A{i}r = {VLSEV}( &A[ai+{i}*gvl*2], sizeof(FLOAT)*2, gvl );", i=i)
dest.write("A{i}i = {VLSEV}( &A[ai+{i}*gvl*2+1], sizeof(FLOAT)*2, gvl );", i=i)
dest.write("ai += {M}*2;")
dest.write()
for tj in range(0, N, tmp_unroll_j):
for ti in range(0, a_regs, tmp_unroll_i):
# note the values in tmp{tmp}* are frequently of similar magnitude and opposite sign
# so accumulating them directly to ACC would lose precision when ACC is larger
for j in range(tj, tj+tmp_unroll_j):
for i in range(ti, ti+tmp_unroll_i):
with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j):
dest.write("tmp{tmp}r = {VMUL_TO_ACC}( A{i}i, B{j}i, gvl);")
dest.write("tmp{tmp}i = {VMUL_TO_ACC}( A{i}r, B{j}i, gvl);")
for j in range(tj, tj+tmp_unroll_j):
for i in range(ti, ti+tmp_unroll_i):
with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j):
dest.write("tmp{tmp}r = VFMACC_RR( tmp{tmp}r, B{j}r, A{i}r, gvl);")
dest.write("tmp{tmp}i = VFMACC_RI( tmp{tmp}i, B{j}r, A{i}i, gvl);")
for j in range(tj, tj+tmp_unroll_j):
for i in range(ti, ti+tmp_unroll_i):
with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j):
dest.write("ACC{dest}r = {__riscv_}vfadd( ACC{dest}r, tmp{tmp}r, gvl);")
dest.write("ACC{dest}i = {__riscv_}vfadd( ACC{dest}i, tmp{tmp}i, gvl);")
dest.write()
dest.write("{index_type} ci=n_top*ldc+m_top;")
dest.write()
for j in range(N):
if TRMM:
for i in range(a_regs):
with dest.map(idx=j*a_regs+i):
dest.write("{param_vector_t} C{idx}r = {__riscv_}vfmul( ACC{idx}r, alphar, gvl );")
dest.write("{param_vector_t} C{idx}i = {__riscv_}vfmul( ACC{idx}i, alphar, gvl );")
else:
for i in range(a_regs):
idx = j*a_regs+i
increment = 'ci += ldc-gvl*{};'.format(a_regs-1) if (i == a_regs-1) else ' ci += gvl;'
if idx == N*a_regs-1:
increment = ''
with dest.map(idx=j*a_regs+i, increment=increment):
dest.write("{param_vector_t} C{idx}r = {VLSEV}( &C[ci*2+0], sizeof(FLOAT)*2, gvl );")
dest.write("{param_vector_t} C{idx}i = {VLSEV}( &C[ci*2+1], sizeof(FLOAT)*2, gvl );")
dest.write("{increment}")
if not TRMM:
for j in range(N):
for i in range(a_regs):
with dest.map(idx=j*a_regs+i):
dest.write("C{idx}r = {__riscv_}vfmacc( C{idx}r, alphar, ACC{idx}r, gvl );")
dest.write("C{idx}i = {__riscv_}vfmacc( C{idx}i, alphar, ACC{idx}i, gvl );")
for j in range(N):
for i in range(a_regs):
with dest.map(idx=j*a_regs+i):
dest.write("C{idx}r = {__riscv_}vfnmsac( C{idx}r, alphai, ACC{idx}i, gvl );")
dest.write("C{idx}i = {__riscv_}vfmacc ( C{idx}i, alphai, ACC{idx}r, gvl );")
if not TRMM:
dest.write()
dest.write("ci=n_top*ldc+m_top;")
dest.write()
for j in range(N):
for i in range(a_regs):
idx = j*a_regs+i
increment = 'ci += ldc-gvl*{};'.format(a_regs-1) if (i == a_regs-1) else ' ci += gvl;'
if idx == N*a_regs-1:
increment = ''
with dest.map(idx=j*a_regs+i, increment=increment):
dest.write("{VSSEV}( &C[ci*2+0], sizeof(FLOAT)*2, C{idx}r, gvl);")
dest.write("{VSSEV}( &C[ci*2+1], sizeof(FLOAT)*2, C{idx}i, gvl);")
dest.write("{increment}")
#-----------------------------------------------------------------------
def generate_gemm_kernel( settings, OUTPUT ):
if settings['conjugate'].value:
ERROR('conjugate gemm not yet supported')
is_complex = settings['complex'].value
generate_gemm_kernel_inner = generate_gemm_kernel_inner_complex if is_complex else generate_gemm_kernel_inner_real
dest = Target(OUTPUT, { k:str(settings[k].value) for k in settings })
M = settings['M'].value
N = settings['N'].value
vlenmax = int( settings['reg_width_bits'].value / settings['ELEN_PARAM'].value )
a_regs = max(int(M/vlenmax), 1)
accumulation_regs = a_regs * N * settings['LMUL_ACC'].value
required_regs = accumulation_regs + a_regs
if is_complex:
required_regs = required_regs * 2 + 2
dest.write('''
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define S0 1
#define S1 -1
#define S2 1
#define S3 1
#define VFMACC_RR __riscv_vfmsac{tail_policy}
#define VFMACC_RI __riscv_vfmacc{tail_policy}
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define S0 1
#define S1 1
#define S2 1
#define S3 -1
#define VFMACC_RR __riscv_vfmacc{tail_policy}
#define VFMACC_RI __riscv_vfmsac{tail_policy}
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define S0 1
#define S1 1
#define S2 -1
#define S3 1
#define VFMACC_RR __riscv_vfmacc{tail_policy}
#define VFMACC_RI __riscv_vfnmsac{tail_policy}
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
#define S0 1
#define S1 -1
#define S2 -1
#define S3 -1
#define VFMACC_RR __riscv_vfmsac{tail_policy}
#define VFMACC_RI __riscv_vfnmacc{tail_policy}
#endif
'''.format(tail_policy=settings['tail_policy'].value))
if required_regs > 32:
raise Exception("{} vector registers needed during accumulation for unrolling {} x {}{} but only 32 are available".format(
required_regs, N, M, (" with wide accumulator" if settings['LMUL_ACC'].value > 1 else '')
))
TRMM = (settings['op'].value == 'trmm')
if TRMM:
with dest.block("#if defined(LEFT) != defined(TRANSA)", "#endif"):
dest.write("#define BACKWARDS")
dest.write("int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, {alpha}, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc{trmm})",
alpha = ('FLOAT alphar, FLOAT alphai' if is_complex else 'FLOAT alpha'),
trmm = (', BLASLONG offset' if TRMM else '')
)
with dest.block("{{", "}}", elt_size='*2' if is_complex else ''):
if settings['trace'].value:
dest.write("printf(\"\\n\\nENTRY: %s(%d) M %d N %d K %d ldc %d\\n\", __FILE__, __LINE__, M, N, K, ldc);")
dest.write("{index_type} gvl = 0;")
dest.write("{index_type} m_top = 0;")
dest.write("{index_type} n_top = 0;")
dest.write()
dest.write()
dest.write("// -- MAIN PASS")
with dest.block("for ({index_type} j=0; j<N/{N}; j+=1) {{", "}}"):
dest.write("m_top = 0;")
dest.write("{index_type} gvl = {VSETVL}({vlenmax});", vlenmax=min(vlenmax,max(int(M/a_regs),1)))
dest.write()
with dest.block("for ({index_type} i=0; i<M/{M}; i+=1) {{", "}}"):
generate_gemm_kernel_inner( settings, dest, M, N, vlenmax, a_regs )
dest.write( "m_top += {M};" )
dest.write()
dest.write()
dest.write("// -- tails for main pass")
generate_M_tails( dest, settings, M, N )
dest.write( "n_top += {N};" )
N_tail = int(N/2)
while( N_tail > 0 ):
with dest.map(N=N_tail):
dest.write()
dest.write()
dest.write("// -- tails for N={N}")
with dest.block("if( N & {N} ) {{", "}}" ):
if settings['trace'].value:
dest.write("printf(\"N tail entry: %s(%d) M %d N %d K %d m_top %d n_top %d\\n\", __FILE__, __LINE__, M, N, K, m_top, n_top);")
dest.write("gvl = {VSETVL}({vlenmax});", vlenmax=min(vlenmax,max(int(M/a_regs),1)))
dest.write("m_top = 0;")
with dest.block("for ({index_type} i=0; i<M/{M}; i+=1) {{", "}}"):
generate_gemm_kernel_inner( settings, dest, M, N_tail, vlenmax, a_regs )
dest.write("m_top += {M};")
generate_M_tails( dest, settings, M, N_tail )
dest.write("n_top += {N};")
N_tail = int(N_tail/2)
dest.write("return 0;");
#-----------------------------------------------------------------------
def generate_M_tails( dest, settings, M, N ):
M_tail = int(M/2)
M_tail_min = settings['M_tail_scalar_from'].value
vlenmax = int( settings['reg_width_bits'].value / settings['ELEN_PARAM'].value )
TRMM = (settings['op'].value == 'trmm')
is_complex = settings['complex'].value
generate_gemm_kernel_inner = generate_gemm_kernel_inner_complex if is_complex else generate_gemm_kernel_inner_real
while( M_tail > M_tail_min ):
with dest.block("if( M & {M_tail} ) {{", "}}", M_tail=M_tail ):
if settings['trace'].value:
dest.write("printf(\"tail: %s(%d) M %d N %d K %d m_top %d n_top %d\\n\", __FILE__, __LINE__, M, N, K, m_top, n_top);")
a_regs = max( 1, int(M_tail/vlenmax) )
vlen = int(M_tail/a_regs)
dest.write("gvl = {VSETVL}({vlen});\n", vlen=vlen)
generate_gemm_kernel_inner( settings, dest, M_tail, N, vlen, a_regs )
dest.write( "m_top += {M_tail};" )
M_tail = int( M_tail / 2 )
while( M_tail > 0 ):
with dest.block("if( M & {M_tail} ) {{", "}}",
M_tail=M_tail,
N=N,
result_t = ('double' if settings['force_acc_double'].value else settings['param_scalar_t'].value)
):
if settings['trace'].value:
dest.write("printf(\"tail: %s(%d) M %d N %d K %d m_top %d n_top %d\\n\", __FILE__, __LINE__, M, N, K, m_top, n_top);")
for r in range(M_tail * N * (2 if is_complex else 1)):
dest.write("{result_t} result{r} = 0;",
r=r
)
dest.write("{index_type} ai=m_top*K{elt_size};")
dest.write("{index_type} bi=n_top*K{elt_size};")
if TRMM:
with dest.map(M=M_tail, N=N):
generate_trmm_block( dest )
with dest.block("for({index_type} k=0; k<{Kend}; k++) {{", "}}", Kend = ('pass_K' if TRMM else 'K') ):
for ki in range( N ):
for kj in range( M_tail ):
if is_complex:
dest.write("result{dest}+=S0*A[ai+{kj}+0]*B[bi+{ki}+0] + S1*A[ai+{kj}+1]*B[bi+{ki}+1];".format(
dest=(ki*M_tail+kj)*2, kj=kj*2, ki=ki*2
))
dest.write("result{dest}+=S2*A[ai+{kj}+1]*B[bi+{ki}+0] + S3*A[ai+{kj}+0]*B[bi+{ki}+1];".format(
dest=(ki*M_tail+kj)*2+1, kj=kj*2, ki=ki*2
))
else:
dest.write("result{dest}+=A[ai+{kj}]*B[bi+{ki}];".format(
dest=ki*M_tail+kj, kj=kj, ki=ki
))
dest.write("ai+={M_tail}{elt_size};")
dest.write("bi+={N}{elt_size};")
dest.write("{index_type} ci=n_top*ldc+m_top;")
if is_complex:
dest.write("{result_t} Cr, Ci;")
for ki in range( N ):
for kj in range( M_tail ):
if is_complex:
if TRMM:
dest.write('Cr = result{dest}*alphar;', dest=(ki*M_tail+kj)*2+0)
dest.write('Ci = result{dest}*alphar;', dest=(ki*M_tail+kj)*2+1)
else:
dest.write('Cr = C[(ci+{ki}*ldc+{kj})*2+0];', ki=ki, kj=kj)
dest.write('Ci = C[(ci+{ki}*ldc+{kj})*2+1];', ki=ki, kj=kj)
dest.write('Cr += result{dest}*alphar;', dest=(ki*M_tail+kj)*2+0)
dest.write('Ci += result{dest}*alphar;', dest=(ki*M_tail+kj)*2+1)
dest.write('Cr -= result{dest}*alphai;', dest=(ki*M_tail+kj)*2+1)
dest.write('Ci += result{dest}*alphai;', dest=(ki*M_tail+kj)*2+0)
dest.write("C[(ci+{ki}*ldc+{kj})*2+0] = Cr;", ki=ki, kj=kj )
dest.write("C[(ci+{ki}*ldc+{kj})*2+1] = Ci;", ki=ki, kj=kj )
else:
op = '' if TRMM else '+'
dest.write("C[ci+{ki}*ldc+{kj}] {op}= alpha * result{dest};",
ki=ki, kj=kj, op=op, dest=ki*M_tail+kj
)
dest.write("m_top+={M_tail};")
M_tail = int(M_tail/2)
#-----------------------------------------------------------------------
class Setting(object):
def __init__( self, value, convert = None ):
self._value = value
self._convert = convert
@classmethod
def ENUM( cls, *values ):
def closure( values ):
return lambda value: values[value.lower()]
return closure( { v.lower():v for v in values } )
@classmethod
def BOOL( cls, value ):
return value.lower().startswith('t') or value == '1'
@property
def value( self ):
return self._value
@property
def configurable( self ):
return self._convert is not None
@value.setter
def value( self, value ):
self._value = self._convert( value )
def __str__( self ):
return str(self._value)
#-----------------------------------------------------------------------
def main():
settings = {
'op': Setting( 'gemm', Setting.ENUM( 'gemm', 'trmm' ) ),
'M': Setting( 16, int ),
'N': Setting( 4, int ),
'reg_width_bits': Setting( 256, int ),
'LMUL': Setting( 1, int ),
'M_tail_scalar_from':Setting( 2, int ),
'cpu': Setting( 'zvl256b', str ),
'param_precision': Setting( 'float', Setting.ENUM( 'float', 'double' ) ),
'force_acc_double': Setting( False, Setting.BOOL ),
'complex': Setting( False, Setting.BOOL ),
'conjugate': Setting( False, Setting.BOOL ),
'index_type': Setting( 'BLASLONG', str ),
'trace': Setting( False, Setting.BOOL ),
'output': Setting( None, str ),
'tail_policy': Setting( '', str ), # _ta, if toolchain supports it
'__riscv_': Setting( '__riscv_', str),
}
for item in sys.argv[1:]:
try:
name, value = tuple(item.split( '=', 1 ))
except:
ERROR("couldn't parse {}, expected arguments of the form name=value".format(item))
if name not in settings:
ERROR("couldn't parse {}, {} it is not a known option\n".format( item, name )
+"options (and current defaults) are\n{}".format(
" ".join([ '{}={}'.format(k, settings[k].value) for k in settings.keys()]))
)
try:
settings[name].value = value
except:
import traceback
traceback.print_exc()
ERROR("couldn't parse {}".format(item))
if settings['output'].value is None:
if settings['complex'].value:
prefix = 'z' if settings['param_precision'].value == 'double' else 'c'
else:
prefix = 'd' if settings['param_precision'].value == 'double' else 's'
settings['output'] = Setting('{}{}_kernel_{}x{}_{}.c'.format(
prefix,
settings['op'],
settings['M'],
settings['N'],
settings['cpu']
))
if settings['param_precision'].value == 'double':
settings['param_scalar_t'] = Setting( 'double' )
settings['ELEN_PARAM'] = Setting(64)
else:
settings['param_scalar_t'] = Setting( 'float' )
settings['ELEN_PARAM'] = Setting(32)
settings['VFMUL'] = Setting( '{}vfmul_vf_f{}m{}{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL'], settings['tail_policy']) )
settings['VFMACC'] = Setting( '{}vfmacc_vf_f{}m{}{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL'], settings['tail_policy']) )
settings['ELEN_ACC'] = settings['ELEN_PARAM']
settings['LMUL_ACC'] = Setting(settings['LMUL'].value)
widen = ''
if settings['force_acc_double'].value and (settings['param_precision'].value == 'float'):
settings['ELEN_ACC'] = Setting(64)
settings['LMUL_ACC'] = Setting(settings['LMUL'].value*2)
settings['VFNCVT'] = Setting('{}vfncvt_f_f_w_f{}m{}{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL'], settings['tail_policy']))
widen = 'w'
settings['VMUL_TO_ACC'] = Setting( '{}vf{}mul_vf_f{}m{}{}'.format(settings['__riscv_'], widen, settings['ELEN_ACC'], settings['LMUL_ACC'], settings['tail_policy']) )
settings['VMACC_TO_ACC'] = Setting( '{}vf{}macc_vf_f{}m{}{}'.format(settings['__riscv_'], widen, settings['ELEN_ACC'], settings['LMUL_ACC'], settings['tail_policy']) )
settings['param_vector_t']=Setting('vfloat{}m{}_t'.format(settings['ELEN_PARAM'], settings['LMUL']))
settings['acc_vector_t'] =Setting('vfloat{}m{}_t'.format(settings['ELEN_ACC'], settings['LMUL_ACC']))
settings['VLEV'] =Setting('{}vle{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL']))
settings['VSEV'] =Setting('{}vse{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL']))
settings['VLSEV'] =Setting('{}vlse{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL']))
settings['VSSEV'] =Setting('{}vsse{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL']))
settings['VSETVL'] =Setting('{}vsetvl_e{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL']))
to_stdout = (settings['output'].value == '-')
if not to_stdout:
print("Writing {}".format(settings['output'].value), file=sys.stderr)
with open(sys.stdout.fileno() if to_stdout else settings['output'].value, 'w') as destination_file:
def OUTPUT(*args, **kwargs):
print(*args, file=destination_file, **kwargs)
OUTPUT("/*\n\nAUTOGENERATED KERNEL\nSettings:\n {}".format(" ".join([ "{}={}\n".format(k, repr(settings[k].value)) for k in sorted(settings.keys()) if settings[k].configurable])))
OUTPUT("Derived:\n {}\n*/\n".format(" ".join([ "{}={}\n".format(k, repr(settings[k].value)) for k in sorted(settings.keys()) if not settings[k].configurable])))
OUTPUT('#include "common.h"')
OUTPUT("\n")
if settings['op'].value in ('gemm', 'trmm'):
generate_gemm_kernel(settings, OUTPUT)
else:
ERROR("unsupported kernel type {}".format(settings['op']))
if __name__ == "__main__":
main()
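
The generator is driven entirely by name=value arguments matching the settings table in main(). As a hypothetical invocation (paths assumed, not part of the diff): running ./generate_kernel.py op=gemm M=16 N=4 reg_width_bits=256 param_precision=double cpu=zvl256b from kernel/riscv64 should write dgemm_kernel_16x4_zvl256b.c, and passing output=- prints the generated kernel to stdout instead; unknown options are rejected with a list of the supported settings and their current defaults.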

View File

@ -27,118 +27,111 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include <math.h>
#include <float.h>
#if defined(DOUBLE)
#define ABS fabs
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m8_t
#define VSETVL(n) __riscv_vsetvl_e64m4(n)
#define FLOAT_V_T vfloat64m4_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
#define MASK_T vbool8_t
#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
#define VFMAXVV_FLOAT vfmax_vv_f64m8
#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8
#define VMFIRSTM vmfirst_m_b8
#define UINT_V_T vuint64m8_t
#define VIDV_MASK_UINT vid_v_u64m8_m
#define VIDV_UINT vid_v_u64m8
#define VADDVX_MASK_UINT vadd_vx_u64m8_m
#define VADDVX_UINT vadd_vx_u64m8
#define VMVVX_UINT vmv_v_x_u64m8
#define VLEV_FLOAT __riscv_vle64_v_f64m4
#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m4_f64m1
#define MASK_T vbool16_t
#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m4_b16
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m4
#define VMFGEVF_FLOAT __riscv_vmfge_vf_f64m4_b16
#define VMFIRSTM __riscv_vfirst_m_b16
#define UINT_V_T vuint64m4_t
#define VIDV_UINT __riscv_vid_v_u64m4
#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m4_mu
#define VADDVX_UINT __riscv_vadd_vx_u64m4
#define VMVVX_UINT __riscv_vmv_v_x_u64m4
#define VFABS_FLOAT __riscv_vfabs_v_f64m4
#define VCOMPRESS __riscv_vcompress_vm_u64m4
#define VMV_X __riscv_vmv_x_s_u64m4_u64
#else
#define ABS fabsf
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define VSETVL(n) __riscv_vsetvl_e32m4(n)
#define FLOAT_V_T vfloat32m4_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
#define MASK_T vbool4_t
#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
#define VFMAXVV_FLOAT vfmax_vv_f32m8
#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4
#define VMFIRSTM vmfirst_m_b4
#define UINT_V_T vuint32m8_t
#define VIDV_MASK_UINT vid_v_u32m8_m
#define VIDV_UINT vid_v_u32m8
#define VADDVX_MASK_UINT vadd_vx_u32m8_m
#define VADDVX_UINT vadd_vx_u32m8
#define VMVVX_UINT vmv_v_x_u32m8
#define VLEV_FLOAT __riscv_vle32_v_f32m4
#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m4_f32m1
#define MASK_T vbool8_t
#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m4_b8
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m4
#define VMFGEVF_FLOAT __riscv_vmfge_vf_f32m4_b8
#define VMFIRSTM __riscv_vfirst_m_b8
#define UINT_V_T vuint32m4_t
#define VIDV_UINT __riscv_vid_v_u32m4
#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m4_mu
#define VADDVX_UINT __riscv_vadd_vx_u32m4
#define VMVVX_UINT __riscv_vmv_v_x_u32m4
#define VFABS_FLOAT __riscv_vfabs_v_f32m4
#define VCOMPRESS __riscv_vcompress_vm_u32m4
#define VMV_X __riscv_vmv_x_s_u32m4_u32
#endif
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i=0, j=0;
FLOAT maxf=0.0;
BLASLONG i=0, j=0;
unsigned int max_index = 0;
if (n <= 0 || inc_x <= 0) return(max_index);
if (n <= 0 || inc_x <= 0) return(max_index);
FLOAT maxf=-FLT_MAX;
FLOAT_V_T vx, v_max;
UINT_V_T v_max_index;
MASK_T mask;
unsigned int gvl = 0;
FLOAT_V_T_M1 v_res, v_z0;
gvl = VSETVL_MAX;
v_res = VFMVVF_FLOAT_M1(0, gvl);
v_z0 = VFMVVF_FLOAT_M1(0, gvl);
FLOAT_V_T_M1 v_res;
v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1);
gvl = VSETVL(n);
UINT_V_T vid = VIDV_UINT(gvl);
if(inc_x == 1){
gvl = VSETVL(n);
v_max_index = VMVVX_UINT(0, gvl);
v_max = VFMVVF_FLOAT(-1, gvl);
v_max = VFMVVF_FLOAT(-FLT_MAX, gvl);
for(i=0,j=0; i < n/gvl; i++){
vx = VLEV_FLOAT(&x[j], gvl);
//fabs(vector)
mask = VMFLTVF_FLOAT(vx, 0, gvl);
vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);
vx = VFABS_FLOAT(vx, gvl);
//index where element greater than v_max
mask = VMFLTVV_FLOAT(v_max, vx, gvl);
v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl);
v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl);
v_max_index = VADDVX_MASK_UINT(mask, v_max_index, vid, j, gvl);
//update v_max and start_index j
v_max = VFMAXVV_FLOAT(v_max, vx, gvl);
j += gvl;
}
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl);
maxf = *((FLOAT*)&v_res);
v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
maxf = EXTRACT_FLOAT(v_res);
mask = VMFGEVF_FLOAT(v_max, maxf, gvl);
max_index = VMFIRSTM(mask,gvl);
max_index = *((unsigned int*)&v_max_index+max_index);
UINT_V_T compressed;
compressed = VCOMPRESS(v_max_index, mask, gvl);
max_index = VMV_X(compressed);
if(j < n){
gvl = VSETVL(n-j);
vx = VLEV_FLOAT(&x[j], gvl);
//fabs(vector)
mask = VMFLTVF_FLOAT(vx, 0, gvl);
v_max = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);
v_max = VLEV_FLOAT(&x[j], gvl);
v_max = VFABS_FLOAT(v_max, gvl);
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl);
FLOAT cur_maxf = *((FLOAT*)&v_res);
v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
FLOAT cur_maxf = EXTRACT_FLOAT(v_res);
if(cur_maxf > maxf){
//tail index
v_max_index = VIDV_UINT(gvl);
v_max_index = VADDVX_UINT(v_max_index, j, gvl);
v_max_index = VADDVX_UINT(vid, j, gvl);
mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
max_index = VMFIRSTM(mask,gvl);
max_index = *((unsigned int*)&v_max_index+max_index);
UINT_V_T compressed;
compressed = VCOMPRESS(v_max_index, mask, gvl);
max_index = VMV_X(compressed);
}
}
}else{
@ -146,51 +139,48 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
unsigned int stride_x = inc_x * sizeof(FLOAT);
unsigned int idx = 0, inc_v = gvl * inc_x;
v_max = VFMVVF_FLOAT(-FLT_MAX, gvl);
v_max_index = VMVVX_UINT(0, gvl);
v_max = VFMVVF_FLOAT(-1, gvl);
for(i=0,j=0; i < n/gvl; i++){
vx = VLSEV_FLOAT(&x[idx], stride_x, gvl);
//fabs(vector)
mask = VMFLTVF_FLOAT(vx, 0, gvl);
vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);
vx = VFABS_FLOAT(vx, gvl);
//index where element greater than v_max
mask = VMFLTVV_FLOAT(v_max, vx, gvl);
v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl);
v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl);
v_max_index = VADDVX_MASK_UINT(mask, v_max_index, vid, j, gvl);
//update v_max and start_index j
v_max = VFMAXVV_FLOAT(v_max, vx, gvl);
j += gvl;
idx += inc_v;
}
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl);
maxf = *((FLOAT*)&v_res);
v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
maxf = EXTRACT_FLOAT(v_res);
mask = VMFGEVF_FLOAT(v_max, maxf, gvl);
max_index = VMFIRSTM(mask,gvl);
max_index = *((unsigned int*)&v_max_index+max_index);
UINT_V_T compressed;
compressed = VCOMPRESS(v_max_index, mask, gvl);
max_index = VMV_X(compressed);
if(j < n){
gvl = VSETVL(n-j);
vx = VLSEV_FLOAT(&x[idx], stride_x, gvl);
//fabs(vector)
mask = VMFLTVF_FLOAT(vx, 0, gvl);
v_max = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);
v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl);
v_max = VFABS_FLOAT(v_max, gvl);
v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
FLOAT cur_maxf = EXTRACT_FLOAT(v_res);
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl);
FLOAT cur_maxf = *((FLOAT*)&v_res);
if(cur_maxf > maxf){
//tail index
v_max_index = VIDV_UINT(gvl);
v_max_index = VADDVX_UINT(v_max_index, j, gvl);
v_max_index = VADDVX_UINT(vid, j, gvl);
mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
max_index = VMFIRSTM(mask,gvl);
max_index = *((unsigned int*)&v_max_index+max_index);
UINT_V_T compressed;
compressed = VCOMPRESS(v_max_index, mask, gvl);
max_index = VMV_X(compressed);
}
}
}
return(max_index+1);
return(max_index+1);
}
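
The index extraction above no longer reads the index vector back through a pointer cast; instead the comparison mask selects the lanes that hit the extremum, vcompress packs the matching indices down so the first hit lands in element 0, and vmv_x reads it out. A minimal single-precision sketch of the idiom follows, using the same intrinsics as the kernel; the helper name is hypothetical and only riscv_vector.h and float.h are assumed:

/* 0-based index of the first lane of v_abs holding the maximum value,
   given v_idx = element indices and the active vector length vl */
static unsigned int first_max_index(vfloat32m4_t v_abs, vuint32m4_t v_idx, size_t vl)
{
    vfloat32m1_t seed = __riscv_vfmv_v_f_f32m1(-FLT_MAX, 1);
    vfloat32m1_t v_red = __riscv_vfredmax_vs_f32m4_f32m1(v_abs, seed, vl);
    float maxf = __riscv_vfmv_f_s_f32m1_f32(v_red);
    vbool8_t mask = __riscv_vmfge_vf_f32m4_b8(v_abs, maxf, vl);       /* lanes equal to the max */
    vuint32m4_t packed = __riscv_vcompress_vm_u32m4(v_idx, mask, vl); /* first hit -> element 0 */
    return __riscv_vmv_x_s_u32m4_u32(packed);
}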

View File

@ -31,85 +31,79 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(DOUBLE)
#define ABS fabs
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define VSETVL(n) __riscv_vsetvl_e64m8(n)
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
#define VLEV_FLOAT __riscv_vle64_v_f64m8
#define VLSEV_FLOAT __riscv_vlse64_v_f64m8
#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1
#define MASK_T vbool8_t
#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
#define VFMINVV_FLOAT vfmin_vv_f64m8
#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8
#define VMFIRSTM vmfirst_m_b8
#define VMFGTVV_FLOAT __riscv_vmfgt_vv_f64m8_b8
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8
#define VMFLEVF_FLOAT __riscv_vmfle_vf_f64m8_b8
#define VMFIRSTM __riscv_vfirst_m_b8
#define UINT_V_T vuint64m8_t
#define VIDV_MASK_UINT vid_v_u64m8_m
#define VIDV_UINT vid_v_u64m8
#define VADDVX_MASK_UINT vadd_vx_u64m8_m
#define VADDVX_UINT vadd_vx_u64m8
#define VMVVX_UINT vmv_v_x_u64m8
#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu
#define VIDV_UINT __riscv_vid_v_u64m8
#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu
#define VADDVX_UINT __riscv_vadd_vx_u64m8
#define VMVVX_UINT __riscv_vmv_v_x_u64m8
#define VFABS_FLOAT __riscv_vfabs_v_f64m8
#define VCOMPRESS __riscv_vcompress_vm_u64m8
#define VMV_X __riscv_vmv_x_s_u64m8_u64
#else
#define ABS fabsf
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define VSETVL(n) __riscv_vsetvl_e32m8(n)
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
#define VLEV_FLOAT __riscv_vle32_v_f32m8
#define VLSEV_FLOAT __riscv_vlse32_v_f32m8
#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1
#define MASK_T vbool4_t
#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
#define VFMINVV_FLOAT vfmin_vv_f32m8
#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4
#define VMFIRSTM vmfirst_m_b4
#define VMFGTVV_FLOAT __riscv_vmfgt_vv_f32m8_b4
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8
#define VMFLEVF_FLOAT __riscv_vmfle_vf_f32m8_b4
#define VMFIRSTM __riscv_vfirst_m_b4
#define UINT_V_T vuint32m8_t
#define VIDV_MASK_UINT vid_v_u32m8_m
#define VIDV_UINT vid_v_u32m8
#define VADDVX_MASK_UINT vadd_vx_u32m8_m
#define VADDVX_UINT vadd_vx_u32m8
#define VMVVX_UINT vmv_v_x_u32m8
#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu
#define VIDV_UINT __riscv_vid_v_u32m8
#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu
#define VADDVX_UINT __riscv_vadd_vx_u32m8
#define VMVVX_UINT __riscv_vmv_v_x_u32m8
#define VFABS_FLOAT __riscv_vfabs_v_f32m8
#define VCOMPRESS __riscv_vcompress_vm_u32m8
#define VMV_X __riscv_vmv_x_s_u32m8_u32
#endif
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i=0, j=0;
FLOAT minf=FLT_MAX;
BLASLONG i=0, j=0;
unsigned int min_index = 0;
if (n <= 0 || inc_x <= 0) return(min_index);
if (n <= 0 || inc_x <= 0) return(min_index);
FLOAT minf=FLT_MAX;
FLOAT_V_T vx, v_min;
UINT_V_T v_min_index;
MASK_T mask;
unsigned int gvl = 0;
FLOAT_V_T_M1 v_res, v_max;
gvl = VSETVL_MAX;
v_res = VFMVVF_FLOAT_M1(0, gvl);
v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl);
FLOAT_V_T_M1 v_res;
v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1);
if(inc_x == 1){
gvl = VSETVL(n);
v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
v_min_index = VMVVX_UINT(0, gvl);
v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
for(i=0,j=0; i < n/gvl; i++){
vx = VLEV_FLOAT(&x[j], gvl);
//fabs(vector)
mask = VMFLTVF_FLOAT(vx, 0, gvl);
vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);
vx = VFABS_FLOAT(vx, gvl);
//index where element less than v_min
mask = VMFLTVV_FLOAT(vx, v_min, gvl);
//index where v_min is greater than the element
mask = VMFGTVV_FLOAT(v_min, vx, gvl);
v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl);
v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl);
@ -117,29 +111,29 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
v_min = VFMINVV_FLOAT(v_min, vx, gvl);
j += gvl;
}
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
minf = *((FLOAT*)&v_res);
v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
minf = EXTRACT_FLOAT(v_res);
mask = VMFLEVF_FLOAT(v_min, minf, gvl);
min_index = VMFIRSTM(mask,gvl);
min_index = *((unsigned int*)&v_min_index+min_index);
UINT_V_T compressed;
compressed = VCOMPRESS(v_min_index, mask, gvl);
min_index = VMV_X(compressed);
if(j < n){
gvl = VSETVL(n-j);
vx = VLEV_FLOAT(&x[j], gvl);
//fabs(vector)
mask = VMFLTVF_FLOAT(vx, 0, gvl);
v_min = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);
v_min = VLEV_FLOAT(&x[j], gvl);
v_min = VFABS_FLOAT(v_min, gvl);
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
FLOAT cur_minf = *((FLOAT*)&v_res);
if(cur_minf < minf){
v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
FLOAT cur_minf = EXTRACT_FLOAT(v_res);
if(cur_minf < minf){
//tail index
v_min_index = VIDV_UINT(gvl);
v_min_index = VADDVX_UINT(v_min_index, j, gvl);
mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl);
min_index = VMFIRSTM(mask,gvl);
min_index = *((unsigned int*)&v_min_index+min_index);
UINT_V_T compressed;
compressed = VCOMPRESS(v_min_index, mask, gvl);
min_index = VMV_X(compressed);
}
}
}else{
@ -151,12 +145,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
v_min_index = VMVVX_UINT(0, gvl);
for(i=0,j=0; i < n/gvl; i++){
vx = VLSEV_FLOAT(&x[idx], stride_x, gvl);
//fabs(vector)
mask = VMFLTVF_FLOAT(vx, 0, gvl);
vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);
vx = VFABS_FLOAT(vx, gvl);
//index where element less than v_min
mask = VMFLTVV_FLOAT(vx, v_min, gvl);
//index where v_min is greater than the element
mask = VMFGTVV_FLOAT(v_min, vx, gvl);
v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl);
v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl);
@ -165,33 +157,31 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
j += gvl;
idx += inc_v;
}
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
minf = *((FLOAT*)&v_res);
v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
minf = EXTRACT_FLOAT(v_res);
mask = VMFLEVF_FLOAT(v_min, minf, gvl);
min_index = VMFIRSTM(mask,gvl);
min_index = *((unsigned int*)&v_min_index+min_index);
UINT_V_T compressed;
compressed = VCOMPRESS(v_min_index, mask, gvl);
min_index = VMV_X(compressed);
if(j < n){
gvl = VSETVL(n-j);
vx = VLSEV_FLOAT(&x[idx], stride_x, gvl);
//fabs(vector)
mask = VMFLTVF_FLOAT(vx, 0, gvl);
v_min = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);
v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl);
v_min = VFABS_FLOAT(v_min, gvl);
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
FLOAT cur_minf = *((FLOAT*)&v_res);
if(cur_minf < minf){
v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
FLOAT cur_minf = EXTRACT_FLOAT(v_res);
if(cur_minf < minf){
//tail index
v_min_index = VIDV_UINT(gvl);
v_min_index = VADDVX_UINT(v_min_index, j, gvl);
mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl);
min_index = VMFIRSTM(mask,gvl);
min_index = *((unsigned int*)&v_min_index+min_index);
UINT_V_T compressed;
compressed = VCOMPRESS(v_min_index, mask, gvl);
min_index = VMV_X(compressed);
}
}
}
return(min_index+1);
return(min_index+1);
}

View File

@ -31,68 +31,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(DOUBLE)
#define ABS fabs
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define VSETVL(n) __riscv_vsetvl_e64m8(n)
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
#define VLEV_FLOAT __riscv_vle64_v_f64m8
#define VLSEV_FLOAT __riscv_vlse64_v_f64m8
#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1
#define MASK_T vbool8_t
#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFMAXVV_FLOAT vfmax_vv_f64m8
#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8
#define VMFIRSTM vmfirst_m_b8
#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m8_b8
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m8
#define VMFGEVF_FLOAT __riscv_vmfge_vf_f64m8_b8
#define VMFIRSTM __riscv_vfirst_m_b8
#define UINT_V_T vuint64m8_t
#define VIDV_MASK_UINT vid_v_u64m8_m
#define VIDV_UINT vid_v_u64m8
#define VADDVX_MASK_UINT vadd_vx_u64m8_m
#define VADDVX_UINT vadd_vx_u64m8
#define VMVVX_UINT vmv_v_x_u64m8
#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu
#define VIDV_UINT __riscv_vid_v_u64m8
#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu
#define VADDVX_UINT __riscv_vadd_vx_u64m8
#define VMVVX_UINT __riscv_vmv_v_x_u64m8
#define VCOMPRESS __riscv_vcompress_vm_u64m8
#define VMV_X __riscv_vmv_x_s_u64m8_u64
#else
#define ABS fabsf
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define VSETVL(n) __riscv_vsetvl_e32m8(n)
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
#define VLEV_FLOAT __riscv_vle32_v_f32m8
#define VLSEV_FLOAT __riscv_vlse32_v_f32m8
#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1
#define MASK_T vbool4_t
#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFMAXVV_FLOAT vfmax_vv_f32m8
#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4
#define VMFIRSTM vmfirst_m_b4
#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m8_b4
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m8
#define VMFGEVF_FLOAT __riscv_vmfge_vf_f32m8_b4
#define VMFIRSTM __riscv_vfirst_m_b4
#define UINT_V_T vuint32m8_t
#define VIDV_MASK_UINT vid_v_u32m8_m
#define VIDV_UINT vid_v_u32m8
#define VADDVX_MASK_UINT vadd_vx_u32m8_m
#define VADDVX_UINT vadd_vx_u32m8
#define VMVVX_UINT vmv_v_x_u32m8
#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu
#define VIDV_UINT __riscv_vid_v_u32m8
#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu
#define VADDVX_UINT __riscv_vadd_vx_u32m8
#define VMVVX_UINT __riscv_vmv_v_x_u32m8
#define VCOMPRESS __riscv_vcompress_vm_u32m8
#define VMV_X __riscv_vmv_x_s_u32m8_u32
#endif
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i=0, j=0;
BLASLONG i=0, j=0;
unsigned int max_index = 0;
if (n <= 0 || inc_x <= 0) return(max_index);
FLOAT maxf=-FLT_MAX;
if (n <= 0 || inc_x <= 0) return(max_index);
FLOAT maxf=-FLT_MAX;
FLOAT_V_T vx, v_max;
UINT_V_T v_max_index;
MASK_T mask;
unsigned int gvl = 0;
FLOAT_V_T_M1 v_res, v_min;
gvl = VSETVL_MAX;
v_res = VFMVVF_FLOAT_M1(0, gvl);
v_min = VFMVVF_FLOAT_M1(-FLT_MAX, gvl);
FLOAT_V_T_M1 v_res;
v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1);
if(inc_x == 1){
gvl = VSETVL(n);
@ -104,32 +102,34 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
//index where element greater than v_max
mask = VMFLTVV_FLOAT(v_max, vx, gvl);
v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl);
v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl);
v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl);
//update v_max and start_index j
v_max = VFMAXVV_FLOAT(v_max, vx, gvl);
j += gvl;
}
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl);
maxf = *((FLOAT*)&v_res);
v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
maxf = EXTRACT_FLOAT(v_res);
mask = VMFGEVF_FLOAT(v_max, maxf, gvl);
max_index = VMFIRSTM(mask,gvl);
max_index = *((unsigned int*)&v_max_index+max_index);
UINT_V_T compressed;
compressed = VCOMPRESS(v_max_index, mask, gvl);
max_index = VMV_X(compressed);
if(j < n){
gvl = VSETVL(n-j);
v_max = VLEV_FLOAT(&x[j], gvl);
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl);
FLOAT cur_maxf = *((FLOAT*)&v_res);
v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
FLOAT cur_maxf = EXTRACT_FLOAT(v_res);
if(cur_maxf > maxf){
//tail index
v_max_index = VIDV_UINT(gvl);
v_max_index = VADDVX_UINT(v_max_index, j, gvl);
mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
max_index = VMFIRSTM(mask,gvl);
max_index = *((unsigned int*)&v_max_index+max_index);
UINT_V_T compressed;
compressed = VCOMPRESS(v_max_index, mask, gvl);
max_index = VMV_X(compressed);
}
}
}else{
@ -145,37 +145,37 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
//index where element greater than v_max
mask = VMFLTVV_FLOAT(v_max, vx, gvl);
v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl);
v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl);
v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl);
//update v_max and start_index j
v_max = VFMAXVV_FLOAT(v_max, vx, gvl);
j += gvl;
idx += inc_v;
}
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl);
maxf = *((FLOAT*)&v_res);
v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
maxf = EXTRACT_FLOAT(v_res);
mask = VMFGEVF_FLOAT(v_max, maxf, gvl);
max_index = VMFIRSTM(mask,gvl);
max_index = *((unsigned int*)&v_max_index+max_index);
UINT_V_T compressed;
compressed = VCOMPRESS(v_max_index, mask, gvl);
max_index = VMV_X(compressed);
if(j < n){
gvl = VSETVL(n-j);
v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl);
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl);
FLOAT cur_maxf = *((FLOAT*)&v_res);
v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
FLOAT cur_maxf = EXTRACT_FLOAT(v_res);
if(cur_maxf > maxf){
//tail index
v_max_index = VIDV_UINT(gvl);
v_max_index = VADDVX_UINT(v_max_index, j, gvl);
mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
max_index = VMFIRSTM(mask,gvl);
max_index = *((unsigned int*)&v_max_index+max_index);
UINT_V_T compressed;
compressed = VCOMPRESS(v_max_index, mask, gvl);
max_index = VMV_X(compressed);
}
}
}
return(max_index+1);
return(max_index+1);
}

View File

@ -31,122 +31,105 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(DOUBLE)
#define ABS fabs
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define VSETVL(n) __riscv_vsetvl_e64m8(n)
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
#define VLEV_FLOAT __riscv_vle64_v_f64m8
#define VLSEV_FLOAT __riscv_vlse64_v_f64m8
#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1
#define MASK_T vbool8_t
#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFMINVV_FLOAT vfmin_vv_f64m8
#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8
#define VMFIRSTM vmfirst_m_b8
#define VMFGTVV_FLOAT __riscv_vmfgt_vv_f64m8_b8
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8
#define VMFLEVF_FLOAT __riscv_vmfle_vf_f64m8_b8
#define VMFIRSTM __riscv_vfirst_m_b8
#define UINT_V_T vuint64m8_t
#define VIDV_MASK_UINT vid_v_u64m8_m
#define VIDV_UINT vid_v_u64m8
#define VADDVX_MASK_UINT vadd_vx_u64m8_m
#define VADDVX_UINT vadd_vx_u64m8
#define VMVVX_UINT vmv_v_x_u64m8
#define VIDV_MASK_UINT __riscv_vid_v_u64m8_m
#define VIDV_UINT __riscv_vid_v_u64m8
#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_m
#define VADDVX_UINT __riscv_vadd_vx_u64m8
#define VMVVX_UINT __riscv_vmv_v_x_u64m8
#define VCOMPRESS __riscv_vcompress_vm_u64m8
#define VMV_X __riscv_vmv_x_s_u64m8_u64
#else
#define ABS fabsf
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define VSETVL(n) __riscv_vsetvl_e32m8(n)
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
#define VLEV_FLOAT __riscv_vle32_v_f32m8
#define VLSEV_FLOAT __riscv_vlse32_v_f32m8
#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1
#define MASK_T vbool4_t
#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFMINVV_FLOAT vfmin_vv_f32m8
#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4
#define VMFIRSTM vmfirst_m_b4
#define VMFGTVV_FLOAT __riscv_vmfgt_vv_f32m8_b4
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8
#define VMFLEVF_FLOAT __riscv_vmfle_vf_f32m8_b4
#define VMFIRSTM __riscv_vfirst_m_b4
#define UINT_V_T vuint32m8_t
#define VIDV_MASK_UINT vid_v_u32m8_m
#define VIDV_UINT vid_v_u32m8
#define VADDVX_MASK_UINT vadd_vx_u32m8_m
#define VADDVX_UINT vadd_vx_u32m8
#define VMVVX_UINT vmv_v_x_u32m8
#define VIDV_MASK_UINT __riscv_vid_v_u32m8_m
#define VIDV_UINT __riscv_vid_v_u32m8
#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_m
#define VADDVX_UINT __riscv_vadd_vx_u32m8
#define VMVVX_UINT __riscv_vmv_v_x_u32m8
#define VCOMPRESS __riscv_vcompress_vm_u32m8
#define VMV_X __riscv_vmv_x_s_u32m8_u32
#endif
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i=0, j=0;
FLOAT minf=FLT_MAX;
BLASLONG i=0, j=0;
unsigned int min_index = 0;
if (n <= 0 || inc_x <= 0) return(min_index);
if (n <= 0 || inc_x <= 0) return(min_index);
FLOAT minf=FLT_MAX;
FLOAT_V_T vx, v_min;
UINT_V_T v_min_index;
MASK_T mask;
unsigned int gvl = 0;
FLOAT_V_T_M1 v_res, v_max;
gvl = VSETVL_MAX;
v_res = VFMVVF_FLOAT_M1(0, gvl);
v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl);
FLOAT_V_T_M1 v_res;
v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1);
if(inc_x == 1){
gvl = VSETVL(n);
v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
v_min_index = VMVVX_UINT(0, gvl);
v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
for(i=0,j=0; i < n/gvl; i++){
vx = VLEV_FLOAT(&x[j], gvl);
//index where element less than v_min
mask = VMFLTVV_FLOAT(vx, v_min, gvl);
v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl);
/*
#if defined(DOUBLE)
asm volatile(
"vor.vv v0, %1, %1 \n\t"
"vsetvli x0, %2, e64,m8 \n\t"
"vid.v %0, v0.t \n\t"
:"+v"(v_min_index)
:"v"(mask), "r"(gvl)
:"v0");
#else
asm volatile(
"vor.vv v0, %1, %1 \n\t"
"vsetvli x0, %2, e32,m8 \n\t"
"vid.v %0, v0.t \n\t"
:"+v"(v_min_index)
:"v"(mask), "r"(gvl)
:"v0");
#endif
*/
v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j,gvl);
//index where element greater than v_min
mask = VMFGTVV_FLOAT(v_min, vx, gvl);
v_min_index = VIDV_MASK_UINT(mask, gvl);
v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, gvl);
//update v_min and start_index j
v_min = VFMINVV_FLOAT(v_min, vx, gvl);
j += gvl;
}
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
minf = *((FLOAT*)&v_res);
v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
minf = EXTRACT_FLOAT(v_res);
mask = VMFLEVF_FLOAT(v_min, minf, gvl);
min_index = VMFIRSTM(mask,gvl);
min_index = *((unsigned int*)&v_min_index+min_index);
UINT_V_T compressed;
compressed = VCOMPRESS(v_min_index, mask, gvl);
min_index = VMV_X(compressed);
if(j < n){
gvl = VSETVL(n-j);
v_min = VLEV_FLOAT(&x[j], gvl);
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
FLOAT cur_minf = *((FLOAT*)&v_res);
if(cur_minf < minf){
v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
FLOAT cur_minf = EXTRACT_FLOAT(v_res);
if(cur_minf < minf){
//tail index
v_min_index = VIDV_UINT(gvl);
v_min_index = VADDVX_UINT(v_min_index, j, gvl);
mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl);
min_index = VMFIRSTM(mask,gvl);
min_index = *((unsigned int*)&v_min_index+min_index);
UINT_V_T compressed;
compressed = VCOMPRESS(v_min_index, mask, gvl);
min_index = VMV_X(compressed);
}
}
}else{
@ -159,59 +142,39 @@ asm volatile(
for(i=0,j=0; i < n/gvl; i++){
vx = VLSEV_FLOAT(&x[idx], stride_x, gvl);
//index where element less than v_min
mask = VMFLTVV_FLOAT(vx, v_min, gvl);
v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl);
/*
#if defined(DOUBLE)
asm volatile(
"vor.vv v0, %1, %1 \n\t"
"vsetvli x0, %2, e64,m8 \n\t"
"vid.v %0, v0.t \n\t"
:"+v"(v_min_index)
:"v"(mask), "r"(gvl)
:"v0");
#else
asm volatile(
"vor.vv v0, %1, %1 \n\t"
"vsetvli x0, %2, e32,m8 \n\t"
"vid.v %0, v0.t \n\t"
:"+v"(v_min_index)
:"v"(mask), "r"(gvl)
:"v0");
#endif
*/
v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j,gvl);
//index where element greater than v_min
mask = VMFGTVV_FLOAT(v_min, vx, gvl);
v_min_index = VIDV_MASK_UINT(mask, gvl);
v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, gvl);
//update v_min and start_index j
v_min = VFMINVV_FLOAT(v_min, vx, gvl);
j += gvl;
idx += inc_v;
}
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
minf = *((FLOAT*)&v_res);
v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
minf = EXTRACT_FLOAT(v_res);
mask = VMFLEVF_FLOAT(v_min, minf, gvl);
min_index = VMFIRSTM(mask,gvl);
min_index = *((unsigned int*)&v_min_index+min_index);
UINT_V_T compressed;
compressed = VCOMPRESS(v_min_index, mask, gvl);
min_index = VMV_X(compressed);
if(j < n){
gvl = VSETVL(n-j);
v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl);
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
FLOAT cur_minf = *((FLOAT*)&v_res);
if(cur_minf < minf){
v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
FLOAT cur_minf = EXTRACT_FLOAT(v_res);
if(cur_minf < minf){
//tail index
v_min_index = VIDV_UINT(gvl);
v_min_index = VADDVX_UINT(v_min_index, j, gvl);
mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl);
min_index = VMFIRSTM(mask,gvl);
min_index = *((unsigned int*)&v_min_index+min_index);
UINT_V_T compressed;
compressed = VCOMPRESS(v_min_index, mask, gvl);
min_index = VMV_X(compressed);
}
}
}
return(min_index+1);
return(min_index+1);
}

View File

@ -27,241 +27,132 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include <math.h>
#include <float.h>
#if defined(DOUBLE)
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define VSETVL(n) __riscv_vsetvl_e64m8(n)
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
#define VLSEV_FLOAT vlse64_v_f64m8
#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
#define VLEV_FLOAT __riscv_vle64_v_f64m8
#define VLSEV_FLOAT __riscv_vlse64_v_f64m8
#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1
#define MASK_T vbool8_t
#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
#define VFMAXVV_FLOAT vfmax_vv_f64m8
#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8
#define VMFIRSTM vmfirst_m_b8
#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m8_b8
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m8
#define VMFGEVF_FLOAT __riscv_vmfge_vf_f64m8_b8
#define VMFIRSTM __riscv_vfirst_m_b8
#define UINT_V_T vuint64m8_t
#define VSEVU_UINT vse64_v_u64m8
#define VSEVU_UINT __riscv_vse64_v_u64m8
#define UINT_T long unsigned int
#define VIDV_MASK_UINT vid_v_u64m8_m
#define VIDV_UINT vid_v_u64m8
#define VADDVX_MASK_UINT vadd_vx_u64m8_m
#define VADDVX_UINT vadd_vx_u64m8
#define VFADDVV_FLOAT vfadd_vv_f64m8
#define VMVVX_UINT vmv_v_x_u64m8
#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu
#define VIDV_UINT __riscv_vid_v_u64m8
#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu
#define VADDVX_UINT __riscv_vadd_vx_u64m8
#define VMVVX_UINT __riscv_vmv_v_x_u64m8
#define VFABS_FLOAT __riscv_vfabs_v_f64m8
#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m8
#define VCOMPRESS __riscv_vcompress_vm_u64m8
#define VMV_X __riscv_vmv_x_s_u64m8_u64
#else
#define ABS fabsf
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define VSETVL(n) __riscv_vsetvl_e32m8(n)
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
#define VLSEV_FLOAT vlse32_v_f32m8
#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
#define VLEV_FLOAT __riscv_vle32_v_f32m8
#define VLSEV_FLOAT __riscv_vlse32_v_f32m8
#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1
#define MASK_T vbool4_t
#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
#define VFMAXVV_FLOAT vfmax_vv_f32m8
#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4
#define VMFIRSTM vmfirst_m_b4
#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m8_b4
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m8
#define VMFGEVF_FLOAT __riscv_vmfge_vf_f32m8_b4
#define VMFIRSTM __riscv_vfirst_m_b4
#define UINT_V_T vuint32m8_t
#define UINT_T unsigned int
#define VSEVU_UINT vse32_v_u32m8
#define VIDV_MASK_UINT vid_v_u32m8_m
#define VIDV_UINT vid_v_u32m8
#define VADDVX_MASK_UINT vadd_vx_u32m8_m
#define VADDVX_UINT vadd_vx_u32m8
#define VFADDVV_FLOAT vfadd_vv_f32m8
#define VMVVX_UINT vmv_v_x_u32m8
#define VSEVU_UINT __riscv_vse32_v_u32m8
#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu
#define VIDV_UINT __riscv_vid_v_u32m8
#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu
#define VADDVX_UINT __riscv_vadd_vx_u32m8
#define VMVVX_UINT __riscv_vmv_v_x_u32m8
#define VFABS_FLOAT __riscv_vfabs_v_f32m8
#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m8
#define VCOMPRESS __riscv_vcompress_vm_u32m8
#define VMV_X __riscv_vmv_x_s_u32m8_u32
#endif
#define RVV_M RVV_M8
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i=0, j=0;
FLOAT maxf=0.0;
BLASLONG i=0, j=0;
unsigned int max_index = 0;
if (n <= 0 || inc_x <= 0) return(max_index);
if (n <= 0 || inc_x <= 0) return(max_index);
FLOAT maxf=-FLT_MAX;
FLOAT_V_T vx0, vx1, v_max;
FLOAT_V_T vx, vx2, v_max;
UINT_V_T v_max_index;
MASK_T mask0, mask1;
MASK_T mask;
unsigned int gvl = 0;
FLOAT_V_T_M1 v_res, v_z0;
gvl = VSETVL_MAX;
v_res = VFMVVF_FLOAT_M1(0, gvl);
v_z0 = VFMVVF_FLOAT_M1(0, gvl);
FLOAT_V_T_M1 v_res;
v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1);
gvl = VSETVL(n);
UINT_T temp_uint[gvl];
unsigned int stride_x = inc_x * 2 * sizeof(FLOAT);
unsigned int idx = 0, inc_v = gvl * inc_x * 2;
v_max = VFMVVF_FLOAT(-FLT_MAX, gvl);
v_max_index = VMVVX_UINT(0, gvl);
v_max = VFMVVF_FLOAT(-1, gvl);
BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
BLASLONG inc_xv = gvl * inc_x * 2;
BLASLONG ix = 0;
for(i=0,j=0; i < n/gvl; i++){
vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
//fabs(vector)
mask0 = VMFLTVF_FLOAT(vx0, 0, gvl);
vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl);
/*
#if defined(DOUBLE)
asm volatile(
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+v"(vx0)
:"v"(mask0), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+v"(vx0)
:"v"(mask0), "f"(zero), "r"(gvl)
:"v0");
#endif
*/
vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
//fabs(vector)
mask1 = VMFLTVF_FLOAT(vx1, 0, gvl);
vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl);
/*
#if defined(DOUBLE)
asm volatile(
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+v"(vx1)
:"v"(mask1), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+v"(vx1)
:"v"(mask1), "f"(zero), "r"(gvl)
:"v0");
#endif
*/
vx0 = VFADDVV_FLOAT(vx0, vx1, gvl);
vx = VLSEV_FLOAT(&x[idx], stride_x, gvl);
vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl);
vx = VFABS_FLOAT(vx, gvl);
vx2 = VFABS_FLOAT(vx2, gvl);
vx = VFADDVV_FLOAT(vx, vx2, gvl);
//index where element greater than v_max
mask0 = VMFLTVV_FLOAT(v_max, vx0, gvl);
v_max_index = VIDV_MASK_UINT(mask0, v_max_index, gvl);
/*
#if defined(DOUBLE)
asm volatile(
"vor.vv v0, %1, %1 \n\t"
"vsetvli x0, %2, e64,m8 \n\t"
"vid.v %0, v0.t \n\t"
:"+v"(v_max_index)
:"v"(mask0), "r"(gvl)
:"v0");
#else
asm volatile(
"vor.vv v0, %1, %1 \n\t"
"vsetvli x0, %2, e32,m8 \n\t"
"vid.v %0, v0.t \n\t"
:"+v"(v_max_index)
:"v"(mask0), "r"(gvl)
:"v0");
#endif
*/
v_max_index = VADDVX_MASK_UINT(mask0, v_max_index, v_max_index, j, gvl);
mask = VMFLTVV_FLOAT(v_max, vx, gvl);
v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl);
v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl);
//update v_max and start_index j
v_max = VFMAXVV_FLOAT(v_max, vx0, gvl);
v_max = VFMAXVV_FLOAT(v_max, vx, gvl);
j += gvl;
ix += inc_xv;
idx += inc_v;
}
vx0 = VFMVVF_FLOAT(0, gvl);
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl);
maxf = VFMVFS_FLOAT(v_res);
mask0 = VMFGEVF_FLOAT(v_max, maxf, gvl);
max_index = VMFIRSTM(mask0,gvl);
VSEVU_UINT(temp_uint,v_max_index,gvl);
max_index = temp_uint[max_index];
v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
maxf = EXTRACT_FLOAT(v_res);
mask = VMFGEVF_FLOAT(v_max, maxf, gvl);
UINT_V_T compressed;
compressed = VCOMPRESS(v_max_index, mask, gvl);
max_index = VMV_X(compressed);
if(j < n){
gvl = VSETVL(n-j);
v_max_index = VMVVX_UINT(0, gvl);
vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
//fabs(vector)
mask0 = VMFLTVF_FLOAT(vx0, 0, gvl);
vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl);
/*
#if defined(DOUBLE)
asm volatile(
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+v"(vx0)
:"v"(mask0), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+v"(vx0)
:"v"(mask0), "f"(zero), "r"(gvl)
:"v0");
#endif
*/
vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
//fabs(vector)
mask1 = VMFLTVF_FLOAT(vx1, 0, gvl);
vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl);
/*
#if defined(DOUBLE)
asm volatile(
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+v"(vx1)
:"v"(mask1), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+v"(vx1)
:"v"(mask1), "f"(zero), "r"(gvl)
:"v0");
#endif
*/
v_max = VFADDVV_FLOAT(vx0, vx1, gvl);
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl);
FLOAT cur_maxf = VFMVFS_FLOAT(v_res);
v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl);
vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl);
v_max = VFABS_FLOAT(v_max, gvl);
vx2 = VFABS_FLOAT(vx2, gvl);
v_max = VFADDVV_FLOAT(v_max, vx2, gvl);
v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
FLOAT cur_maxf = EXTRACT_FLOAT(v_res);
if(cur_maxf > maxf){
//tail index
v_max_index = VIDV_UINT(gvl);
v_max_index = VADDVX_UINT(v_max_index, j, gvl);
mask0 = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
max_index = VMFIRSTM(mask0,gvl);
VSEVU_UINT(temp_uint,v_max_index,gvl);
max_index = temp_uint[max_index];
mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
UINT_V_T compressed;
compressed = VCOMPRESS(v_max_index, mask, gvl);
max_index = VMV_X(compressed);
}
}
return(max_index+1);
return(max_index+1);
}
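/* Note: for the complex kernels the comparison key is |Re(x[i])| + |Im(x[i])|,
 * which is what the two VFABS_FLOAT calls followed by VFADDVV_FLOAT compute
 * for each strided element pair above. */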

View File

@ -31,235 +31,128 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(DOUBLE)
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define VSETVL(n) __riscv_vsetvl_e64m8(n)
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
#define VLSEV_FLOAT vlse64_v_f64m8
#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
#define VLEV_FLOAT __riscv_vle64_v_f64m8
#define VLSEV_FLOAT __riscv_vlse64_v_f64m8
#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1
#define MASK_T vbool8_t
#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
#define VFMINVV_FLOAT vfmin_vv_f64m8
#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8
#define VMFIRSTM vmfirst_m_b8
#define VMFGTVV_FLOAT __riscv_vmfgt_vv_f64m8_b8
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8
#define VMFLEVF_FLOAT __riscv_vmfle_vf_f64m8_b8
#define VMFIRSTM __riscv_vfirst_m_b8
#define UINT_V_T vuint64m8_t
#define VSEVU_UINT vse64_v_u64m8
#define UINT_T long unsigned int
#define VIDV_MASK_UINT vid_v_u64m8_m
#define VIDV_UINT vid_v_u64m8
#define VADDVX_MASK_UINT vadd_vx_u64m8_m
#define VADDVX_UINT vadd_vx_u64m8
#define VFADDVV_FLOAT vfadd_vv_f64m8
#define VMVVX_UINT vmv_v_x_u64m8
#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu
#define VIDV_UINT __riscv_vid_v_u64m8
#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu
#define VADDVX_UINT __riscv_vadd_vx_u64m8
#define VMVVX_UINT __riscv_vmv_v_x_u64m8
#define VFABS_FLOAT __riscv_vfabs_v_f64m8
#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m8
#define VCOMPRESS __riscv_vcompress_vm_u64m8
#define VMV_X __riscv_vmv_x_s_u64m8_u64
#else
#define ABS fabsf
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define VSETVL(n) __riscv_vsetvl_e32m8(n)
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
#define VLSEV_FLOAT vlse32_v_f32m8
#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
#define VLEV_FLOAT __riscv_vle32_v_f32m8
#define VLSEV_FLOAT __riscv_vlse32_v_f32m8
#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1
#define MASK_T vbool4_t
#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
#define VFMINVV_FLOAT vfmin_vv_f32m8
#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4
#define VMFIRSTM vmfirst_m_b4
#define VMFGTVV_FLOAT __riscv_vmfgt_vv_f32m8_b4
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8
#define VMFLEVF_FLOAT __riscv_vmfle_vf_f32m8_b4
#define VMFIRSTM __riscv_vfirst_m_b4
#define UINT_V_T vuint32m8_t
#define UINT_T unsigned int
#define VSEVU_UINT vse32_v_u32m8
#define VIDV_MASK_UINT vid_v_u32m8_m
#define VIDV_UINT vid_v_u32m8
#define VADDVX_MASK_UINT vadd_vx_u32m8_m
#define VADDVX_UINT vadd_vx_u32m8
#define VFADDVV_FLOAT vfadd_vv_f32m8
#define VMVVX_UINT vmv_v_x_u32m8
#define VSEVU_UINT __riscv_vse32_v_u32m8
#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu
#define VIDV_UINT __riscv_vid_v_u32m8
#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu
#define VADDVX_UINT __riscv_vadd_vx_u32m8
#define VMVVX_UINT __riscv_vmv_v_x_u32m8
#define VFABS_FLOAT __riscv_vfabs_v_f32m8
#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m8
#define VCOMPRESS __riscv_vcompress_vm_u32m8
#define VMV_X __riscv_vmv_x_s_u32m8_u32
#endif
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i=0, j=0;
FLOAT minf=FLT_MAX;
BLASLONG i=0, j=0;
unsigned int min_index = 0;
if (n <= 0 || inc_x <= 0) return(min_index);
if (n <= 0 || inc_x <= 0) return(min_index);
FLOAT minf=FLT_MAX;
FLOAT_V_T vx0, vx1, v_min;
FLOAT_V_T vx, vx2, v_min;
UINT_V_T v_min_index;
MASK_T mask0, mask1;
MASK_T mask;
unsigned int gvl = 0;
FLOAT_V_T_M1 v_res, v_max;
gvl = VSETVL_MAX;
v_res = VFMVVF_FLOAT_M1(0, gvl);
v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl);
FLOAT_V_T_M1 v_res;
v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1);
gvl = VSETVL(n);
UINT_T temp_uint[gvl];
v_min_index = VMVVX_UINT(0, gvl);
v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
BLASLONG inc_xv = gvl * inc_x * 2;
BLASLONG ix = 0;
for(i=0,j=0; i < n/gvl; i++){
vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
//fabs(vector)
mask0 = VMFLTVF_FLOAT(vx0, 0, gvl);
vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl);
/*
#if defined(DOUBLE)
asm volatile(
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+v"(vx0)
:"v"(mask0), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+v"(vx0)
:"v"(mask0), "f"(zero), "r"(gvl)
:"v0");
#endif
*/
vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
//fabs(vector)
mask1 = VMFLTVF_FLOAT(vx1, 0, gvl);
vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl);
/*
#if defined(DOUBLE)
asm volatile(
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+v"(vx1)
:"v"(mask1), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+v"(vx1)
:"v"(mask1), "f"(zero), "r"(gvl)
:"v0");
#endif
*/
vx0 = VFADDVV_FLOAT(vx0, vx1, gvl);
unsigned int stride_x = inc_x * 2 * sizeof(FLOAT);
unsigned int idx = 0, inc_v = gvl * inc_x * 2;
//index where element less than v_min
mask0 = VMFLTVV_FLOAT(vx0, v_min, gvl);
v_min_index = VIDV_MASK_UINT(mask0, v_min_index, gvl);
/*
#if defined(DOUBLE)
asm volatile(
"vor.vv v0, %1, %1 \n\t"
"vsetvli x0, %2, e64,m8 \n\t"
"vid.v %0, v0.t \n\t"
:"+v"(v_min_index)
:"v"(mask0), "r"(gvl)
:"v0");
#else
asm volatile(
"vor.vv v0, %1, %1 \n\t"
"vsetvli x0, %2, e32,m8 \n\t"
"vid.v %0, v0.t \n\t"
:"+v"(v_min_index)
:"v"(mask0), "r"(gvl)
:"v0");
#endif
*/
v_min_index = VADDVX_MASK_UINT(mask0, v_min_index, v_min_index, j, gvl);
v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
v_min_index = VMVVX_UINT(0, gvl);
for(i=0,j=0; i < n/gvl; i++){
vx = VLSEV_FLOAT(&x[idx], stride_x, gvl);
vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl);
vx = VFABS_FLOAT(vx, gvl);
vx2 = VFABS_FLOAT(vx2, gvl);
vx = VFADDVV_FLOAT(vx, vx2, gvl);
//index where element greater than v_min
mask = VMFGTVV_FLOAT(v_min, vx, gvl);
v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl);
v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl);
//update v_min and start_index j
v_min = VFMINVV_FLOAT(v_min, vx0, gvl);
v_min = VFMINVV_FLOAT(v_min, vx, gvl);
j += gvl;
ix += inc_xv;
idx += inc_v;
}
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
minf = VFMVFS_FLOAT(v_res);
mask0 = VMFLEVF_FLOAT(v_min, minf, gvl);
min_index = VMFIRSTM(mask0,gvl);
VSEVU_UINT(temp_uint,v_min_index,gvl);
min_index = temp_uint[min_index];
v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
minf = EXTRACT_FLOAT(v_res);
mask = VMFLEVF_FLOAT(v_min, minf, gvl);
UINT_V_T compressed;
compressed = VCOMPRESS(v_min_index, mask, gvl);
min_index = VMV_X(compressed);
if(j < n){
gvl = VSETVL(n-j);
v_min_index = VMVVX_UINT(0, gvl);
vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
//fabs(vector)
mask0 = VMFLTVF_FLOAT(vx0, 0, gvl);
vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl);
/*
#if defined(DOUBLE)
asm volatile(
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+v"(vx0)
:"v"(mask0), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+v"(vx0)
:"v"(mask0), "f"(zero), "r"(gvl)
:"v0");
#endif
*/
vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
//fabs(vector)
mask1 = VMFLTVF_FLOAT(vx1, 0, gvl);
vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl);
/*
#if defined(DOUBLE)
asm volatile(
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+v"(vx1)
:"v"(mask1), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+v"(vx1)
:"v"(mask1), "f"(zero), "r"(gvl)
:"v0");
#endif
*/
v_min = VFADDVV_FLOAT(vx0, vx1, gvl);
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
FLOAT cur_minf = VFMVFS_FLOAT(v_res);
if(cur_minf < minf){
v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl);
vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl);
v_min = VFABS_FLOAT(v_min, gvl);
vx2 = VFABS_FLOAT(vx2, gvl);
v_min = VFADDVV_FLOAT(v_min, vx2, gvl);
v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
FLOAT cur_minf = EXTRACT_FLOAT(v_res);
if(cur_minf < minf){
//tail index
v_min_index = VIDV_UINT(gvl);
v_min_index = VADDVX_UINT(v_min_index, j, gvl);
mask0 = VMFLEVF_FLOAT(v_min, cur_minf, gvl);
min_index = VMFIRSTM(mask0,gvl);
VSEVU_UINT(temp_uint,v_min_index,gvl);
min_index = temp_uint[min_index];
mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl);
UINT_V_T compressed;
compressed = VCOMPRESS(v_min_index, mask, gvl);
min_index = VMV_X(compressed);
}
}
return(min_index+1);
return(min_index+1);
}

View File

@ -28,30 +28,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include <math.h>
#include <float.h>
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFMAXVV_FLOAT vfmax_vv_f32m8
#ifdef RISCV64_ZVL256B
# define LMUL m2
# if defined(DOUBLE)
# define ELEN 64
# define MLEN 32
# else
# define ELEN 32
# define MLEN 16
# endif
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFMAXVV_FLOAT vfmax_vv_f64m8
# define LMUL m8
# if defined(DOUBLE)
# define ELEN 64
# define MLEN 8
# else
# define ELEN 32
# define MLEN 4
# endif
#endif
#define _
#define JOIN2_X(x, y) x ## y
#define JOIN2(x, y) JOIN2_X(x, y)
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _)
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _)
#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL)
#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL)
#define VFREDMAXVS_FLOAT JOIN(__riscv_vfredmax_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1))
#define MASK_T JOIN(vbool, MLEN, _t, _, _)
#define VMFLTVF_FLOAT JOIN(__riscv_vmflt_vf_f, ELEN, LMUL, _b, MLEN)
#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _)
#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _)
#define VFMAXVV_FLOAT JOIN(__riscv_vfmax, _vv_f, ELEN, LMUL, _)
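/* Illustration of the token pasting above (default branch, single precision:
 * ELEN=32, LMUL=m8; the `_' macro expands to nothing and serves as an empty
 * placeholder argument):
 *   VSETVL(n)        -> __riscv_vsetvl_e32m8(n)
 *   FLOAT_V_T        -> vfloat32m8_t
 *   VFREDMAXVS_FLOAT -> __riscv_vfredmax_vs_f32m8_f32m1
 * With RISCV64_ZVL256B the same macros produce the m2 variants, e.g.
 * __riscv_vsetvl_e32m2 and vfloat32m2_t. */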
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i=0, j=0;
@ -59,10 +73,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
FLOAT maxf=-FLT_MAX;
unsigned int gvl = 0;
FLOAT_V_T v0, v1, v_max;
FLOAT_V_T_M1 v_res, v_min;
gvl = VSETVL_MAX;
v_res = VFMVVF_FLOAT_M1(0, gvl);
v_min = VFMVVF_FLOAT_M1(-FLT_MAX, gvl);
FLOAT_V_T_M1 v_res;
v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1);
if(inc_x == 1){
gvl = VSETVL(n);
@ -76,15 +88,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
v_max = VFMAXVV_FLOAT(v_max, v1, gvl);
j += gvl * 2;
}
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl);
maxf = *((FLOAT*)&v_res);
v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
}
for(;j<n;){
gvl = VSETVL(n-j);
v0 = VLEV_FLOAT(&x[j], gvl);
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_min, gvl);
if(*((FLOAT*)&v_res) > maxf)
maxf = *((FLOAT*)&v_res);
v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl);
j += gvl;
}
}else{
@ -102,18 +111,16 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
j += gvl * 2;
idx += inc_xv * 2;
}
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl);
maxf = *((FLOAT*)&v_res);
v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
}
for(;j<n;){
gvl = VSETVL(n-j);
v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_min, gvl);
if(*((FLOAT*)&v_res) > maxf)
maxf = *((FLOAT*)&v_res);
v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl);
j += gvl;
}
}
maxf = EXTRACT_FLOAT(v_res);
return(maxf);
}

View File

@ -28,30 +28,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include <math.h>
#include <float.h>
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFMINVV_FLOAT vfmin_vv_f32m8
#ifdef RISCV64_ZVL256B
# define LMUL m2
# if defined(DOUBLE)
# define ELEN 64
# define MLEN 32
# else
# define ELEN 32
# define MLEN 16
# endif
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFMINVV_FLOAT vfmin_vv_f64m8
# define LMUL m8
# if defined(DOUBLE)
# define ELEN 64
# define MLEN 8
# else
# define ELEN 32
# define MLEN 4
# endif
#endif
#define _
#define JOIN2_X(x, y) x ## y
#define JOIN2(x, y) JOIN2_X(x, y)
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _)
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _)
#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL)
#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL)
#define VFREDMINVS_FLOAT JOIN(__riscv_vfredmin_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1))
#define MASK_T JOIN(vbool, MLEN, _t, _, _)
#define VMFLTVF_FLOAT JOIN(__riscv_vmflt_vf_f, ELEN, LMUL, _b, MLEN)
#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _)
#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _)
#define VFMINVV_FLOAT JOIN(__riscv_vfmin, _vv_f, ELEN, LMUL, _)
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i=0, j=0;
@ -59,10 +73,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
FLOAT minf=FLT_MAX;
unsigned int gvl = 0;
FLOAT_V_T v0, v1, v_min;
FLOAT_V_T_M1 v_res, v_max;
gvl = VSETVL_MAX;
v_res = VFMVVF_FLOAT_M1(0, gvl);
v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl);
FLOAT_V_T_M1 v_res;
v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1);
if(inc_x == 1){
gvl = VSETVL(n);
@ -76,15 +88,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
v_min = VFMINVV_FLOAT(v_min, v1, gvl);
j += gvl * 2;
}
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
minf = *((FLOAT*)&v_res);
v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
}
for(;j<n;){
gvl = VSETVL(n-j);
v0 = VLEV_FLOAT(&x[j], gvl);
v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl);
if(*((FLOAT*)&v_res) < minf)
minf = *((FLOAT*)&v_res);
v_res = VFREDMINVS_FLOAT(v0, v_res, gvl);
j += gvl;
}
}else{
@ -102,18 +111,16 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
j += gvl * 2;
idx += inc_xv * 2;
}
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
minf = *((FLOAT*)&v_res);
v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
}
for(;j<n;){
gvl = VSETVL(n-j);
v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl);
if(*((FLOAT*)&v_res) < minf)
minf = *((FLOAT*)&v_res);
v_res = VFREDMINVS_FLOAT(v0, v_res, gvl);
j += gvl;
}
}
minf = EXTRACT_FLOAT(v_res);
return(minf);
}

View File

@ -26,207 +26,185 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m4_t
#define VFMVFS_FLOATM4 vfmv_f_s_f32m4_f32
#define FLOAT_V_T_M1 vfloat32m1_t
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
#define VLEV_FLOAT vle32_v_f32m4
#define VLSEV_FLOAT vlse32_v_f32m4
#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
#define VFMACCVV_FLOAT vfmacc_vv_f32m4
#define VFMVVF_FLOAT vfmv_v_f_f32m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFDOTVV_FLOAT vfdot_vv_f32m4
#define ABS fabsf
#define MASK_T vbool8_t
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m4_m
#define VMFGTVF_FLOAT vmfgt_vf_f32m4_b8
#define VMFIRSTM vmfirst_m_b8
#define VFDIVVF_FLOAT vfdiv_vf_f32m4
#define VMFLTVF_FLOAT vmflt_vf_f32m4_b8
#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1
#ifdef RISCV64_ZVL256B
# define LMUL m1
# if defined(DOUBLE)
# define ELEN 64
# define MLEN 64
# else
# define ELEN 32
# define MLEN 32
# endif
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m4_t
#define VFMVFS_FLOATM4 vfmv_f_s_f64m4_f64
#define FLOAT_V_T_M1 vfloat64m1_t
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
#define VLEV_FLOAT vle64_v_f64m4
#define VLSEV_FLOAT vlse64_v_f64m4
#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
#define VFMACCVV_FLOAT vfmacc_vv_f64m4
#define VFMVVF_FLOAT vfmv_v_f_f64m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFDOTVV_FLOAT vfdot_vv_f64m4
#define ABS fabs
#define MASK_T vbool16_t
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m4_m
#define VMFGTVF_FLOAT vmfgt_vf_f64m4_b16
#define VMFIRSTM vmfirst_m_b16
#define VFDIVVF_FLOAT vfdiv_vf_f64m4
#define VMFLTVF_FLOAT vmflt_vf_f64m4_b16
#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1
# define LMUL m4
# if defined(DOUBLE)
# define ELEN 64
# define MLEN 16
# else
# define ELEN 32
# define MLEN 8
# endif
#endif
#define _
#define JOIN2_X(x, y) x ## y
#define JOIN2(x, y) JOIN2_X(x, y)
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _)
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _)
#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL)
#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL)
#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _)
#define VFMVSF_FLOAT JOIN(__riscv_vfmv, _s_f_f, ELEN, LMUL, _)
#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _)
#define MASK_T JOIN(vbool, MLEN, _t, _, _)
#define VFABS JOIN(__riscv_vfabs, _v_f, ELEN, LMUL, _)
#define VMFNE JOIN(__riscv_vmfne_vf_f,ELEN, LMUL, _b, MLEN)
#define VMFGT JOIN(__riscv_vmfgt_vv_f,ELEN, LMUL, _b, MLEN)
#define VMFEQ JOIN(__riscv_vmfeq_vf_f,ELEN, LMUL, _b, MLEN)
#define VCPOP JOIN(__riscv_vcpop, _m_b, MLEN, _, _)
#define VFREDMAX JOIN(__riscv_vfredmax_vs_f,ELEN,LMUL, JOIN2(_f, ELEN), m1)
#define VFREDMIN JOIN(__riscv_vfredmin_vs_f,ELEN,LMUL, JOIN2(_f, ELEN), m1)
#define VFIRST JOIN(__riscv_vfirst, _m_b, MLEN, _, _)
#define VRGATHER JOIN(__riscv_vrgather, _vx_f, ELEN, LMUL, _)
#define VFDIV JOIN(__riscv_vfdiv, _vv_f, ELEN, LMUL, _)
#define VFDIV_M JOIN(__riscv_vfdiv, _vv_f, ELEN, LMUL, _mu)
#define VFMUL JOIN(__riscv_vfmul, _vv_f, ELEN, LMUL, _)
#define VFMUL_M JOIN(__riscv_vfmul, _vv_f, ELEN, LMUL, _mu)
#define VFMACC JOIN(__riscv_vfmacc, _vv_f, ELEN, LMUL, _)
#define VFMACC_M JOIN(__riscv_vfmacc, _vv_f, ELEN, LMUL, _mu)
#define VMSBF JOIN(__riscv_vmsbf, _m_b, MLEN, _, _)
#define VMSOF JOIN(__riscv_vmsof, _m_b, MLEN, _, _)
#define VMAND JOIN(__riscv_vmand, _mm_b, MLEN, _, _)
#define VMANDN JOIN(__riscv_vmandn, _mm_b, MLEN, _, _)
#define VFREDSUM JOIN(__riscv_vfredusum_vs_f,ELEN,LMUL, JOIN2(_f, ELEN), m1)
#define VMERGE JOIN(__riscv_vmerge, _vvm_f, ELEN, LMUL, _)
#define VSEV_FLOAT JOIN(__riscv_vse, ELEN, _v_f, ELEN, LMUL)
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
#define EXTRACT_FLOAT0_V(v) JOIN(__riscv_vfmv_f_s_f, ELEN, LMUL, _f, ELEN)(v)
//#define DUMP( label, v0, gvl )
#define DUMP( label, v0, gvl ) do{ FLOAT x[16]; VSEV_FLOAT( x, v0, gvl ); printf ("%s(%d): %s [ ", __FILE__, __LINE__, label); for( int xxx = 0; xxx < gvl; ++xxx ) { printf("%f, ", x[xxx]); } printf(" ]\n"); } while(0)
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i=0, j=0;
BLASLONG i=0;
if ( n < 0 ) return(0.0);
if(n <= 0) return(0.0);
if(n == 1) return (ABS(x[0]));
FLOAT_V_T vr, v0, v_zero;
unsigned int gvl = 0;
FLOAT_V_T_M1 v_res, v_z0;
gvl = VSETVL_MAX;
v_res = VFMVVF_FLOAT_M1(0, gvl);
v_z0 = VFMVVF_FLOAT_M1(0, gvl);
FLOAT scale = 0.0, ssq = 0.0;
MASK_T mask;
BLASLONG index = 0;
if(inc_x == 1){
gvl = VSETVL(n);
vr = VFMVVF_FLOAT(0, gvl);
v_zero = VFMVVF_FLOAT(0, gvl);
for(i=0,j=0; i<n/gvl; i++){
v0 = VLEV_FLOAT(&x[j], gvl);
//fabs(vector)
mask = VMFLTVF_FLOAT(v0, 0, gvl);
v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
//if scale change
mask = VMFGTVF_FLOAT(v0, scale, gvl);
index = VMFIRSTM(mask, gvl);
if(index == -1){//no elements greater than scale
if(scale != 0.0){
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
vr = VFMACCVV_FLOAT(vr, v0, v0, gvl);
}
}else{//found greater element
//ssq in vector vr: vr[0]
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
//total ssq before current vector
ssq += VFMVFS_FLOAT(v_res);
//find max
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
//update ssq before max_index
ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
//update scale
scale = VFMVFS_FLOAT(v_res);
//ssq in vector vr
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
}
j += gvl;
}
//ssq in vector vr: vr[0]
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
//total ssq now
ssq += VFMVFS_FLOAT(v_res);
MASK_T nonzero_mask;
MASK_T scale_mask;
//tail
if(j < n){
gvl = VSETVL(n-j);
v0 = VLEV_FLOAT(&x[j], gvl);
//fabs(vector)
mask = VMFLTVF_FLOAT(v0, 0, gvl);
v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
//if scale change
mask = VMFGTVF_FLOAT(v0, scale, gvl);
index = VMFIRSTM(mask, gvl);
if(index == -1){//no elements greater than scale
if(scale != 0.0)
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
}else{//found greater element
//find max
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
//update ssq before max_index
ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
//update scale
scale = VFMVFS_FLOAT(v_res);
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
}
vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
//ssq in vector vr: vr[0]
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
//total ssq now
ssq += VFMVFS_FLOAT(v_res);
}
}else{
gvl = VSETVL(n);
vr = VFMVVF_FLOAT(0, gvl);
v_zero = VFMVVF_FLOAT(0, gvl);
unsigned int stride_x = inc_x * sizeof(FLOAT);
int idx = 0, inc_v = inc_x * gvl;
for(i=0,j=0; i<n/gvl; i++){
v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl);
//fabs(vector)
mask = VMFLTVF_FLOAT(v0, 0, gvl);
v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
//if scale change
mask = VMFGTVF_FLOAT(v0, scale, gvl);
index = VMFIRSTM(mask, gvl);
if(index == -1){//no elements greater than scale
if(scale != 0.0){
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
vr = VFMACCVV_FLOAT(vr, v0, v0, gvl);
}
}else{//found greater element
//ssq in vector vr: vr[0]
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
//total ssq before current vector
ssq += VFMVFS_FLOAT(v_res);
//find max
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
//update ssq before max_index
ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
//update scale
scale = VFMVFS_FLOAT(v_res);
//ssq in vector vr
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
}
j += gvl;
idx += inc_v;
}
//ssq in vector vr: vr[0]
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
//total ssq now
ssq += VFMVFS_FLOAT(v_res);
gvl = VSETVL(n);
FLOAT_V_T v0;
FLOAT_V_T v_ssq = VFMVVF_FLOAT(0, gvl);
FLOAT_V_T v_scale = VFMVVF_FLOAT(0, gvl);
//tail
if(j < n){
gvl = VSETVL(n-j);
v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl);
//fabs(vector)
mask = VMFLTVF_FLOAT(v0, 0, gvl);
v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
//if scale change
mask = VMFGTVF_FLOAT(v0, scale, gvl);
index = VMFIRSTM(mask, gvl);
if(index == -1){//no elements greater than scale
if(scale != 0.0)
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
}else{//found greater element
//find max
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
//update ssq before max_index
ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
//update scale
scale = VFMVFS_FLOATM4(vr);
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
FLOAT scale = 0;
FLOAT ssq = 0;
unsigned int stride_x = inc_x * sizeof(FLOAT);
int idx = 0;
if( n >= gvl ) // don't pay overheads if we're not doing useful work
{
for(i=0; i<n/gvl; i++){
v0 = VLSEV_FLOAT( &x[idx], stride_x, gvl );
nonzero_mask = VMFNE( v0, 0, gvl );
v0 = VFABS( v0, gvl );
scale_mask = VMFGT( v0, v_scale, gvl );
// assume scale changes are relatively infrequent
// unclear if the vcpop+branch is actually a win
// since the operations being skipped are predicated anyway
// need profiling to confirm
if( VCPOP(scale_mask, gvl) )
{
v_scale = VFDIV_M( scale_mask, v_scale, v_scale, v0, gvl );
v_scale = VFMUL_M( scale_mask, v_scale, v_scale, v_scale, gvl );
v_ssq = VFMUL_M( scale_mask, v_ssq, v_ssq, v_scale, gvl );
v_scale = VMERGE( v_scale, v0, scale_mask, gvl );
}
v0 = VFDIV_M( nonzero_mask, v0, v0, v_scale, gvl );
v_ssq = VFMACC_M( nonzero_mask, v_ssq, v0, v0, gvl );
idx += inc_x * gvl;
}
// we have gvl elements which we accumulated independently, with independent scales
// we need to combine these
// naive sort so we process small values first to avoid losing information
// could use vector sort extensions where available, but we're dealing with gvl elts at most
FLOAT * out_ssq = alloca(gvl*sizeof(FLOAT));
FLOAT * out_scale = alloca(gvl*sizeof(FLOAT));
VSEV_FLOAT( out_ssq, v_ssq, gvl );
VSEV_FLOAT( out_scale, v_scale, gvl );
for( int a = 0; a < (gvl-1); ++a )
{
int smallest = a;
for( size_t b = a+1; b < gvl; ++b )
if( out_scale[b] < out_scale[smallest] )
smallest = b;
if( smallest != a )
{
FLOAT tmp1 = out_ssq[a];
FLOAT tmp2 = out_scale[a];
out_ssq[a] = out_ssq[smallest];
out_scale[a] = out_scale[smallest];
out_ssq[smallest] = tmp1;
out_scale[smallest] = tmp2;
}
}
int a = 0;
while( a<gvl && out_scale[a] == 0 )
++a;
if( a < gvl )
{
ssq = out_ssq[a];
scale = out_scale[a];
++a;
for( ; a < gvl; ++a )
{
ssq = ssq * ( scale / out_scale[a] ) * ( scale / out_scale[a] ) + out_ssq[a];
scale = out_scale[a];
}
vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
//ssq in vector vr: vr[0]
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
//total ssq now
ssq += VFMVFS_FLOAT(v_res);
}
}
//finish any tail using scalar ops
i*=gvl*inc_x;
n*=inc_x;
while(i < n){
if ( x[i] != 0.0 ){
FLOAT absxi = ABS( x[i] );
if ( scale < absxi ){
ssq = 1 + ssq * ( scale / absxi ) * ( scale / absxi );
scale = absxi ;
}
else{
ssq += ( absxi/scale ) * ( absxi/scale );
}
}
i += inc_x;
}
return(scale * sqrt(ssq));
}
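/* Hypothetical helper (not part of this kernel) showing the recurrence the
 * combine loop above relies on: each partial result represents
 * scale^2 * ssq, so folding (s2, q2) into (s1, q1) rescales the smaller
 * contribution by the ratio of scales squared, exactly as in the scalar
 * tail loop. */
static inline void merge_ssq(FLOAT *s1, FLOAT *q1, FLOAT s2, FLOAT q2)
{
    if (s2 == 0.0) return;                    /* nothing to fold in          */
    if (*s1 < s2) {                           /* switch to the larger scale  */
        *q1 = *q1 * (*s1 / s2) * (*s1 / s2) + q2;
        *s1 = s2;
    } else {
        *q1 += q2 * (s2 / *s1) * (s2 / *s1);
    }
}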

View File

@ -31,9 +31,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
#define VLEV_FLOAT vle_v_f32m8
#define VLSEV_FLOAT vlse_v_f32m8
#define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1
#define VFMACCVV_FLOAT vfmacc_vv_f32m8
#define VFMVVF_FLOAT vfmv_v_f_f32m8
@ -45,9 +45,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
#define VLEV_FLOAT vle_v_f64m8
#define VLSEV_FLOAT vlse_v_f64m8
#define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1
#define VFMACCVV_FLOAT vfmacc_vv_f64m8
#define VFMVVF_FLOAT vfmv_v_f_f64m8

View File

@ -28,27 +28,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define VSETVL(n) __riscv_vsetvl_e32m4(n)
#define VSETVL_MAX __riscv_vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m4_t
#define VLEV_FLOAT vle32_v_f32m4
#define VLSEV_FLOAT vlse32_v_f32m4
#define VSEV_FLOAT vse32_v_f32m4
#define VSSEV_FLOAT vsse32_v_f32m4
#define VFMACCVF_FLOAT vfmacc_vf_f32m4
#define VFMULVF_FLOAT vfmul_vf_f32m4
#define VFMSACVF_FLOAT vfmsac_vf_f32m4
#define VLEV_FLOAT __riscv_vle32_v_f32m4
#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
#define VSEV_FLOAT __riscv_vse32_v_f32m4
#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4
#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f32m4
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define VSETVL(n) __riscv_vsetvl_e64m4(n)
#define VSETVL_MAX __riscv_vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m4_t
#define VLEV_FLOAT vle64_v_f64m4
#define VLSEV_FLOAT vlse64_v_f64m4
#define VSEV_FLOAT vse64_v_f64m4
#define VSSEV_FLOAT vsse64_v_f64m4
#define VFMACCVF_FLOAT vfmacc_vf_f64m4
#define VFMULVF_FLOAT vfmul_vf_f64m4
#define VFMSACVF_FLOAT vfmsac_vf_f64m4
#define VLEV_FLOAT __riscv_vle64_v_f64m4
#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
#define VSEV_FLOAT __riscv_vse64_v_f64m4
#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4
#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f64m4
#endif
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
@ -57,11 +57,10 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
BLASLONG ix=0,iy=0;
if(n <= 0) return(0);
unsigned int gvl = 0;
unsigned int gvl = VSETVL((inc_x != 0 && inc_y != 0) ? n : 1);
FLOAT_V_T v0, v1, vx, vy;
if(inc_x == 1 && inc_y == 1){
gvl = VSETVL(n);
for(i=0,j=0; i<n/gvl; i++){
vx = VLEV_FLOAT(&x[j], gvl);
vy = VLEV_FLOAT(&y[j], gvl);
@ -90,7 +89,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
VSEV_FLOAT(&y[j], v1, gvl);
}
}else if(inc_y == 1){
gvl = VSETVL(n);
BLASLONG stride_x = inc_x * sizeof(FLOAT);
BLASLONG inc_xv = inc_x * gvl;
for(i=0,j=0; i<n/gvl; i++){
@ -122,7 +120,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
VSEV_FLOAT(&y[j], v1, gvl);
}
}else if(inc_x == 1){
gvl = VSETVL(n);
BLASLONG stride_y = inc_y * sizeof(FLOAT);
BLASLONG inc_yv = inc_y * gvl;
for(i=0,j=0; i<n/gvl; i++){
@ -154,7 +151,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
VSSEV_FLOAT(&y[j*inc_y], stride_y, v1, gvl);
}
}else{
gvl = VSETVL(n);
BLASLONG stride_x = inc_x * sizeof(FLOAT);
BLASLONG stride_y = inc_y * sizeof(FLOAT);
BLASLONG inc_xv = inc_x * gvl;

View File

@ -26,28 +26,41 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VSEV_FLOAT vse32_v_f32m8
#define VSSEV_FLOAT vsse32_v_f32m8
#define VFMULVF_FLOAT vfmul_vf_f32m8
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#ifdef RISCV64_ZVL256B
# define LMUL m2
# if defined(DOUBLE)
# define ELEN 64
# define MLEN 32
# else
# define ELEN 32
# define MLEN 16
# endif
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m8_t
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VSEV_FLOAT vse64_v_f64m8
#define VSSEV_FLOAT vsse64_v_f64m8
#define VFMULVF_FLOAT vfmul_vf_f64m8
#define VFMVVF_FLOAT vfmv_v_f_f64m8
# define LMUL m8
# if defined(DOUBLE)
# define ELEN 64
# define MLEN 8
# else
# define ELEN 32
# define MLEN 4
# endif
#endif
#define _
#define JOIN2_X(x, y) x ## y
#define JOIN2(x, y) JOIN2_X(x, y)
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _)
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL)
#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL)
#define VSEV_FLOAT JOIN(__riscv_vse, ELEN, _v_f, ELEN, LMUL)
#define VSSEV_FLOAT JOIN(__riscv_vsse, ELEN, _v_f, ELEN, LMUL)
#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _)
#define VFMULVF_FLOAT JOIN(__riscv_vfmul, _vf_f, ELEN, LMUL, _)
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
BLASLONG i=0,j=0;
@ -84,25 +97,25 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
}
}else{
if(da == 0.0){
BLASLONG stride_x = inc_x * sizeof(FLOAT);
BLASLONG ix = 0;
gvl = VSETVL(n);
BLASLONG stride_x = inc_x * sizeof(FLOAT);
BLASLONG ix = 0;
if(gvl <= n / 2){
long int inc_xv = gvl * inc_x;
v0 = VFMVVF_FLOAT(0, gvl);
for(i = 0, j = 0; i < n/(2*gvl); i++, j+=2*gvl){
VSSEV_FLOAT(&x[ix], stride_x, v0, gvl);
VSSEV_FLOAT(&x[ix + inc_xv], stride_x, v0, gvl);
ix += inc_xv * 2;
}
v0 = VFMVVF_FLOAT(0, gvl);
for(i = 0; i < n/(gvl*2); ++i ){
VSSEV_FLOAT(&x[ix], stride_x, v0, gvl);
ix += inc_x * gvl;
VSSEV_FLOAT(&x[ix], stride_x, v0, gvl);
ix += inc_x * gvl;
}
//tail
for(; j <n; ){
gvl = VSETVL(n-j);
i *= gvl*2;
while( i < n ){
gvl = VSETVL(n-i);
v0 = VFMVVF_FLOAT(0, gvl);
VSSEV_FLOAT(&x[ix], stride_x, v0, gvl);
j += gvl;
ix += inc_x * gvl;
VSSEV_FLOAT(&x[ix], stride_x, v0, gvl);
i += gvl;
ix += inc_x * gvl;
}
}else{
gvl = VSETVL(n);

File diff suppressed because it is too large

File diff suppressed because it is too large

kernel/riscv64/sum_vector.c (new file, 114 lines)
View File

@ -0,0 +1,114 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
#if !defined(DOUBLE)
#define VSETVL(n) __riscv_vsetvl_e32m8(n)
#define VSETVL_MAX __riscv_vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLEV_FLOAT __riscv_vle32_v_f32m8
#define VLSEV_FLOAT __riscv_vlse32_v_f32m8
#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m8_f32m1
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m8
#else
#define VSETVL(n) __riscv_vsetvl_e64m8(n)
#define VSETVL_MAX __riscv_vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLEV_FLOAT __riscv_vle64_v_f64m8
#define VLSEV_FLOAT __riscv_vlse64_v_f64m8
#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m8_f64m1
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m8
#endif
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i=0, j=0;
BLASLONG ix=0;
FLOAT asumf=0.0;
if (n <= 0 || inc_x <= 0) return(asumf);
unsigned int gvl = 0;
FLOAT_V_T v0, v1, v_sum;
FLOAT_V_T_M1 v_res;
gvl = VSETVL_MAX;
v_res = VFMVVF_FLOAT_M1(0, gvl);
if(inc_x == 1){
gvl = VSETVL(n);
if(gvl <= n/2){
v_sum = VFMVVF_FLOAT(0, gvl);
for(i=0,j=0; i<n/(gvl*2); i++){
v0 = VLEV_FLOAT(&x[j], gvl);
v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);
v1 = VLEV_FLOAT(&x[j+gvl], gvl);
v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);
j += gvl * 2;
}
v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl);
}
for(;j<n;){
gvl = VSETVL(n-j);
v0 = VLEV_FLOAT(&x[j], gvl);
v_res = VFREDSUMVS_FLOAT(v0, v_res, gvl);
j += gvl;
}
}else{
gvl = VSETVL(n);
unsigned int stride_x = inc_x * sizeof(FLOAT);
if(gvl <= n/2){
v_sum = VFMVVF_FLOAT(0, gvl);
BLASLONG inc_xv = inc_x * gvl;
for(i=0,j=0; i<n/(gvl*2); i++){
v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);
v1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl);
v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);
j += gvl * 2;
ix += inc_xv * 2;  /* advance the base element index past the two vectors just read */
}
v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl);
}
for(;j<n;){
gvl = VSETVL(n-j);
v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
v_res = VFREDSUMVS_FLOAT(v0, v_res, gvl);
j += gvl;
}
}
asumf = EXTRACT_FLOAT(v_res);
return(asumf);
}

View File

@ -27,35 +27,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include <stdio.h>
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VSEV_FLOAT vse32_v_f32m8
#define VSSEV_FLOAT vsse32_v_f32m8
#ifdef RISCV64_ZVL256B
# define LMUL m2
# if defined(DOUBLE)
# define ELEN 64
# define MLEN 32
# else
# define ELEN 32
# define MLEN 16
# endif
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m8_t
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VSEV_FLOAT vse64_v_f64m8
#define VSSEV_FLOAT vsse64_v_f64m8
# define LMUL m8
# if defined(DOUBLE)
# define ELEN 64
# define MLEN 8
# else
# define ELEN 32
# define MLEN 4
# endif
#endif
#define _
#define JOIN2_X(x, y) x ## y
#define JOIN2(x, y) JOIN2_X(x, y)
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _)
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL)
#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL)
#define VSEV_FLOAT JOIN(__riscv_vse, ELEN, _v_f, ELEN, LMUL)
#define VSSEV_FLOAT JOIN(__riscv_vsse, ELEN, _v_f, ELEN, LMUL)
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
BLASLONG i = 0, j = 0;
BLASLONG ix = 0,iy = 0;
BLASLONG stride_x, stride_y;
FLOAT_V_T vx0, vx1, vy0, vy1;
unsigned int gvl = 0;
if (n < 0) return(0);
unsigned int gvl = VSETVL((inc_x != 0 && inc_y != 0) ? n : 1);
if( inc_x == 0 && inc_y == 0 ) { n = n & 1; }
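/* Corner case: with inc_x == 0 and inc_y == 0 every iteration swaps the same
 * x[0]/y[0] pair, so an even count is a no-op and an odd count reduces to a
 * single swap (hence n & 1). When only one increment is zero, the vector
 * length of 1 chosen above presumably keeps the overlapping stride-0 accesses
 * well defined. */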
if(inc_x == 1 && inc_y == 1){
gvl = VSETVL(n);
if(gvl <= n/2){
for(i=0,j=0; i<n/(2*gvl); i++){
vx0 = VLEV_FLOAT(&x[j], gvl);
@ -79,7 +96,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
j+=gvl;
}
}else if (inc_y == 1){
gvl = VSETVL(n);
stride_x = inc_x * sizeof(FLOAT);
if(gvl <= n/2){
BLASLONG inc_xv = inc_x * gvl;
@ -107,7 +123,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
ix += inc_x * gvl;
}
}else if(inc_x == 1){
gvl = VSETVL(n);
stride_y = inc_y * sizeof(FLOAT);
if(gvl <= n/2){
BLASLONG inc_yv = inc_y * gvl;
@ -135,7 +150,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
iy += inc_y * gvl;
}
}else{
gvl = VSETVL(n);
stride_x = inc_x * sizeof(FLOAT);
stride_y = inc_y * sizeof(FLOAT);
if(gvl <= n/2){

View File

@ -27,37 +27,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define VSETVL(n) __riscv_vsetvl_e32m4(n)
#define VSETVL_MAX __riscv_vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m4_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
#define VLEV_FLOAT vle32_v_f32m4
#define VLSEV_FLOAT vlse32_v_f32m4
#define VSEV_FLOAT vse32_v_f32m4
#define VSSEV_FLOAT vsse32_v_f32m4
#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
#define VFMACCVV_FLOAT vfmacc_vv_f32m4
#define VFMACCVF_FLOAT vfmacc_vf_f32m4
#define VFMVVF_FLOAT vfmv_v_f_f32m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFMULVV_FLOAT vfmul_vv_f32m4
#define VLEV_FLOAT __riscv_vle32_v_f32m4
#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
#define VSEV_FLOAT __riscv_vse32_v_f32m4
#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1
#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define VSETVL(n) __riscv_vsetvl_e64m4(n)
#define VSETVL_MAX __riscv_vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m4_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
#define VLEV_FLOAT vle64_v_f64m4
#define VLSEV_FLOAT vlse64_v_f64m4
#define VSEV_FLOAT vse64_v_f64m4
#define VSSEV_FLOAT vsse64_v_f64m4
#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
#define VFMACCVV_FLOAT vfmacc_vv_f64m4
#define VFMACCVF_FLOAT vfmacc_vf_f64m4
#define VFMVVF_FLOAT vfmv_v_f_f64m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFMULVV_FLOAT vfmul_vv_f64m4
#define VLEV_FLOAT __riscv_vle64_v_f64m4
#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
#define VSEV_FLOAT __riscv_vse64_v_f64m4
#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1
#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4
#endif
int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
@ -99,8 +97,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
i += gvl;
}
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
temp2 = VFMVFS_FLOAT(v_res);
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
temp2 = EXTRACT_FLOAT(v_res);
if(i < m){
gvl = VSETVL(m-i);
vy = VLEV_FLOAT(&y[i], gvl);
@ -110,8 +108,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
vx = VLEV_FLOAT(&x[i], gvl);
vr = VFMULVV_FLOAT(vx, va, gvl);
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
temp2 += VFMVFS_FLOAT(v_res);
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
temp2 += EXTRACT_FLOAT(v_res);
}
}
y[j] += alpha * temp2;
@ -144,8 +142,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
i += gvl;
iy += inc_yv;
}
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
temp2 = VFMVFS_FLOAT(v_res);
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
temp2 = EXTRACT_FLOAT(v_res);
if(i < m){
gvl = VSETVL(m-i);
vy = VLSEV_FLOAT(&y[iy], stride_y, gvl);
@ -155,8 +153,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
vx = VLEV_FLOAT(&x[i], gvl);
vr = VFMULVV_FLOAT(vx, va, gvl);
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
temp2 += VFMVFS_FLOAT(v_res);
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
temp2 += EXTRACT_FLOAT(v_res);
}
}
y[jy] += alpha * temp2;
@ -190,8 +188,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
i += gvl;
ix += inc_xv;
}
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
temp2 = VFMVFS_FLOAT(v_res);
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
temp2 = EXTRACT_FLOAT(v_res);
if(i < m){
gvl = VSETVL(m-i);
vy = VLEV_FLOAT(&y[i], gvl);
@ -201,8 +199,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
vr = VFMULVV_FLOAT(vx, va, gvl);
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
temp2 += VFMVFS_FLOAT(v_res);
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
temp2 += EXTRACT_FLOAT(v_res);
}
}
y[j] += alpha * temp2;
@ -241,8 +239,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
ix += inc_xv;
iy += inc_yv;
}
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
temp2 = VFMVFS_FLOAT(v_res);
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
temp2 = EXTRACT_FLOAT(v_res);
if(i < m){
gvl = VSETVL(m-i);
vy = VLSEV_FLOAT(&y[iy], stride_y, gvl);
@ -252,8 +250,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
vr = VFMULVV_FLOAT(vx, va, gvl);
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
temp2 += VFMVFS_FLOAT(v_res);
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
temp2 += EXTRACT_FLOAT(v_res);
}
}
y[jy] += alpha * temp2;

View File

@ -27,39 +27,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define VSETVL(n) __riscv_vsetvl_e32m4(n)
#define VSETVL_MAX __riscv_vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m4_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
#define VLEV_FLOAT vle32_v_f32m4
#define VLSEV_FLOAT vlse32_v_f32m4
#define VSEV_FLOAT vse32_v_f32m4
#define VSSEV_FLOAT vsse32_v_f32m4
#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
#define VFMACCVV_FLOAT vfmacc_vv_f32m4
#define VFMACCVF_FLOAT vfmacc_vf_f32m4
#define VFMVVF_FLOAT vfmv_v_f_f32m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFDOTVV_FLOAT vfdot_vv_f32m4
#define VFMULVV_FLOAT vfmul_vv_f32m4
#define VLEV_FLOAT __riscv_vle32_v_f32m4
#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
#define VSEV_FLOAT __riscv_vse32_v_f32m4
#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1
#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
#define VFDOTVV_FLOAT __riscv_vfdot_vv_f32m4
#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define VSETVL(n) __riscv_vsetvl_e64m4(n)
#define VSETVL_MAX __riscv_vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m4_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
#define VLEV_FLOAT vle64_v_f64m4
#define VLSEV_FLOAT vlse64_v_f64m4
#define VSEV_FLOAT vse64_v_f64m4
#define VSSEV_FLOAT vsse64_v_f64m4
#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
#define VFMACCVV_FLOAT vfmacc_vv_f64m4
#define VFMACCVF_FLOAT vfmacc_vf_f64m4
#define VFMVVF_FLOAT vfmv_v_f_f64m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFDOTVV_FLOAT vfdot_vv_f64m4
#define VFMULVV_FLOAT vfmul_vv_f64m4
#define VLEV_FLOAT __riscv_vle64_v_f64m4
#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
#define VSEV_FLOAT __riscv_vse64_v_f64m4
#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1
#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
#define VFDOTVV_FLOAT __riscv_vfdot_vv_f64m4
#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4
#endif
int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
@ -101,8 +99,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
i += gvl;
}
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
temp2 = VFMVFS_FLOAT(v_res);
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
temp2 = EXTRACT_FLOAT(v_res);
if(i < j){
gvl = VSETVL(j-i);
vy = VLEV_FLOAT(&y[i], gvl);
@ -112,8 +110,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
vx = VLEV_FLOAT(&x[i], gvl);
vr = VFMULVV_FLOAT(vx, va, gvl);
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
temp2 += VFMVFS_FLOAT(v_res);
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
temp2 += EXTRACT_FLOAT(v_res);
}
}
y[j] += temp1 * a_ptr[j] + alpha * temp2;
@ -145,8 +143,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
i += gvl;
iy += inc_yv;
}
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
temp2 = VFMVFS_FLOAT(v_res);
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
temp2 = EXTRACT_FLOAT(v_res);
if(i < j){
gvl = VSETVL(j-i);
vy = VLSEV_FLOAT(&y[iy], stride_y, gvl);
@ -156,8 +154,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
vx = VLEV_FLOAT(&x[i], gvl);
vr = VFMULVV_FLOAT(vx, va, gvl);
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
temp2 += VFMVFS_FLOAT(v_res);
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
temp2 += EXTRACT_FLOAT(v_res);
}
}
y[jy] += temp1 * a_ptr[j] + alpha * temp2;
@ -190,8 +188,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
i += gvl;
ix += inc_xv;
}
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
temp2 = VFMVFS_FLOAT(v_res);
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
temp2 = EXTRACT_FLOAT(v_res);
if(i < j){
gvl = VSETVL(j-i);
vy = VLEV_FLOAT(&y[i], gvl);
@ -201,8 +199,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
vr = VFMULVV_FLOAT(vx, va, gvl);
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
temp2 += VFMVFS_FLOAT(v_res);
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
temp2 += EXTRACT_FLOAT(v_res);
}
}
y[j] += temp1 * a_ptr[j] + alpha * temp2;
@ -240,8 +238,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
ix += inc_xv;
iy += inc_yv;
}
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
temp2 = VFMVFS_FLOAT(v_res);
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
temp2 = EXTRACT_FLOAT(v_res);
if(i < j){
gvl = VSETVL(j-i);
vy = VLSEV_FLOAT(&y[iy], stride_y, gvl);
@ -251,8 +249,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
vr = VFMULVV_FLOAT(vx, va, gvl);
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
temp2 += VFMVFS_FLOAT(v_res);
v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
temp2 += EXTRACT_FLOAT(v_res);
}
}
y[jy] += temp1 * a_ptr[j] + alpha * temp2;

View File

@ -28,40 +28,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include <math.h>
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
#define VLSEV_FLOAT vlse32_v_f32m8
#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
#define MASK_T vbool4_t
#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
#define VFMAXVV_FLOAT vfmax_vv_f32m8
#define VFADDVV_FLOAT vfadd_vv_f32m8
#ifdef RISCV64_ZVL256B
# define LMUL m2
# if defined(DOUBLE)
# define ELEN 64
# define MLEN 32
# else
# define ELEN 32
# define MLEN 16
# endif
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
#define VLSEV_FLOAT vlse64_v_f64m8
#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
#define MASK_T vbool8_t
#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
#define VFMAXVV_FLOAT vfmax_vv_f64m8
#define VFADDVV_FLOAT vfadd_vv_f64m8
# define LMUL m8
# if defined(DOUBLE)
# define ELEN 64
# define MLEN 8
# else
# define ELEN 32
# define MLEN 4
# endif
#endif
#define _
#define JOIN2_X(x, y) x ## y
#define JOIN2(x, y) JOIN2_X(x, y)
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _)
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _)
#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL)
#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL)
#define VFREDMAXVS_FLOAT JOIN(__riscv_vfredmax_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1))
#define MASK_T JOIN(vbool, MLEN, _t, _, _)
#define VMFLTVF_FLOAT JOIN(__riscv_vmflt_vf_f, ELEN, LMUL, _b, MLEN)
#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _)
#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _)
#define VFRSUBVF_MASK_FLOAT JOIN(__riscv_vfrsub,_vf_f, ELEN, LMUL, _m)
#define VFMAXVV_FLOAT JOIN(__riscv_vfmax, _vv_f, ELEN, LMUL, _)
#define VFADDVV_FLOAT JOIN(__riscv_vfadd, _vv_f, ELEN, LMUL, _)
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i=0, j=0;
@ -70,10 +75,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
if (n <= 0 || inc_x <= 0) return(maxf);
unsigned int gvl = 0;
FLOAT_V_T v0, v1, v_max;
FLOAT_V_T_M1 v_res, v_z0;
gvl = VSETVL_MAX;
v_res = VFMVVF_FLOAT_M1(0, gvl);
v_z0 = VFMVVF_FLOAT_M1(0, gvl);
FLOAT_V_T_M1 v_res;
v_res = VFMVVF_FLOAT_M1(0, 1);
MASK_T mask0, mask1;
BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
@ -84,9 +87,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, 0, gvl);
mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, 0, gvl);
v0 = VFADDVV_FLOAT(v0, v1, gvl);
v_max = VFMAXVV_FLOAT(v_max, v0, gvl);
@ -94,22 +97,19 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
j += gvl;
ix += inc_xv;
}
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl);
maxf = VFMVFS_FLOAT(v_res);
v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
if(j<n){
gvl = VSETVL(n-j);
v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, 0, gvl);
mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, 0, gvl);
v1 = VFADDVV_FLOAT(v0, v1, gvl);
v_res = VFREDMAXVS_FLOAT(v_res, v1, v_z0, gvl);
if(VFMVFS_FLOAT(v_res)> maxf)
maxf = VFMVFS_FLOAT(v_res);
v_res = VFREDMAXVS_FLOAT(v1, v_res, gvl);
}
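/* note (added for clarity): v_res carries the running maximum through both
 * the main loop and the tail, so the single extract below replaces the
 * per-chunk scalar compares of the previous version. */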
maxf = EXTRACT_FLOAT(v_res);
return(maxf);
}

View File

@ -29,38 +29,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <math.h>
#include <float.h>
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
#define VLSEV_FLOAT vlse32_v_f32m8
#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
#define MASK_T vbool4_t
#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
#define VFMINVV_FLOAT vfmin_vv_f32m8
#define VFADDVV_FLOAT vfadd_vv_f32m8
#ifdef RISCV64_ZVL256B
# define LMUL m2
# if defined(DOUBLE)
# define ELEN 64
# define MLEN 32
# else
# define ELEN 32
# define MLEN 16
# endif
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
#define VLSEV_FLOAT vlse64_v_f64m8
#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
#define MASK_T vbool8_t
#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
#define VFMINVV_FLOAT vfmin_vv_f64m8
#define VFADDVV_FLOAT vfadd_vv_f64m8
# define LMUL m8
# if defined(DOUBLE)
# define ELEN 64
# define MLEN 8
# else
# define ELEN 32
# define MLEN 4
# endif
#endif
#define _
#define JOIN2_X(x, y) x ## y
#define JOIN2(x, y) JOIN2_X(x, y)
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _)
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _)
#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL)
#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL)
#define VFREDMINVS_FLOAT JOIN(__riscv_vfredmin_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1))
#define MASK_T JOIN(vbool, MLEN, _t, _, _)
#define VMFLTVF_FLOAT JOIN(__riscv_vmflt_vf_f, ELEN, LMUL, _b, MLEN)
#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _)
#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _)
#define VFRSUBVF_MASK_FLOAT JOIN(__riscv_vfrsub,_vf_f, ELEN, LMUL, _m)
#define VFMINVV_FLOAT JOIN(__riscv_vfmin, _vv_f, ELEN, LMUL, _)
#define VFADDVV_FLOAT JOIN(__riscv_vfadd, _vv_f, ELEN, LMUL, _)
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i=0, j=0;
@ -69,10 +77,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
FLOAT minf=FLT_MAX;
unsigned int gvl = 0;
FLOAT_V_T v0, v1, v_min;
FLOAT_V_T_M1 v_res, v_max;
gvl = VSETVL_MAX;
v_res = VFMVVF_FLOAT_M1(0, gvl);
v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl);
FLOAT_V_T_M1 v_res;
v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1);
MASK_T mask0, mask1;
BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
@ -83,9 +89,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, 0, gvl);
mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, 0, gvl);
v0 = VFADDVV_FLOAT(v0, v1, gvl);
v_min = VFMINVV_FLOAT(v_min, v0, gvl);
@ -93,21 +99,20 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
j += gvl;
ix += inc_xv;
}
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
minf = VFMVFS_FLOAT(v_res);
v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
if(j<n){
gvl = VSETVL(n-j);
v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, 0, gvl);
mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, 0, gvl);
v1 = VFADDVV_FLOAT(v0, v1, gvl);
v_res = VFREDMINVS_FLOAT(v_res, v1, v_max, gvl);
if(VFMVFS_FLOAT(v_res) < minf)
minf = VFMVFS_FLOAT(v_res);
v_res = VFREDMINVS_FLOAT(v1, v_res, gvl);
}
minf = EXTRACT_FLOAT(v_res);
return(minf);
}

View File

@ -28,37 +28,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include <math.h>
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VFFMVFS_FLOAT vfmv_f_s_f32m1_f32
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1
#define MASK_T vbool4_t
#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
#define VFADDVV_FLOAT vfadd_vv_f32m8
#ifdef RISCV64_ZVL256B
# define LMUL m2
# if defined(DOUBLE)
# define ELEN 64
# define MLEN _b32
# else
# define ELEN 32
# define MLEN _b16
# endif
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VFFMVFS_FLOAT vfmv_f_s_f64m1_f64
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1
#define MASK_T vbool8_t
#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
#define VFADDVV_FLOAT vfadd_vv_f64m8
# define LMUL m8
# if defined(DOUBLE)
# define ELEN 64
# define MLEN _b8
# else
# define ELEN 32
# define MLEN _b4
# endif
#endif
#define _
#define JOIN2_X(x, y) x ## y
#define JOIN2(x, y) JOIN2_X(x, y)
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _)
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _)
#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL)
#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL)
#define VFREDSUMVS_FLOAT JOIN(__riscv_vfredusum_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1))
#define VFABS_FLOAT JOIN(__riscv_vfabs, _v_f, ELEN, LMUL, _)
#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _)
#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _)
#define VFADDVV_FLOAT JOIN(__riscv_vfadd, _vv_f, ELEN, LMUL, _)
#define VMFLTVF_FLOAT JOIN(__riscv_vmflt, _vf_f, ELEN, LMUL, MLEN)
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i=0, j=0;
@ -67,12 +73,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
if (n <= 0 || inc_x <= 0) return(asumf);
unsigned int gvl = 0;
FLOAT_V_T v0, v1, v_zero,v_sum;
FLOAT_V_T_M1 v_res, v_z0;
gvl = VSETVL_MAX;
v_res = VFMVVF_FLOAT_M1(0, gvl);
v_z0 = VFMVVF_FLOAT_M1(0, gvl);
FLOAT_V_T_M1 v_res;
v_res = VFMVVF_FLOAT_M1(0, 1);
MASK_T mask0, mask1;
if(inc_x == 1){
BLASLONG n2 = n * 2;
gvl = VSETVL(n2);
@ -81,26 +84,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
v_sum = VFMVVF_FLOAT(0, gvl);
for(i=0,j=0; i<n2/(gvl*2); i++){
v0 = VLEV_FLOAT(&x[j], gvl);
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
v0 = VFABS_FLOAT(v0, gvl);
v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);
v1 = VLEV_FLOAT(&x[j+gvl], gvl);
mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
v1 = VFABS_FLOAT(v1, gvl);
v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);
j += gvl * 2;
}
v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl);
asumf += VFFMVFS_FLOAT(v_res);
v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl);
}
for(;j<n2;){
gvl = VSETVL(n2-j);
v0 = VLEV_FLOAT(&x[j], gvl);
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
v_res = VFREDSUMVS_FLOAT(v_res, v0, v_z0, gvl);
asumf += VFFMVFS_FLOAT(v_res);
v0 = VFABS_FLOAT(v0, gvl);
v_res = VFREDSUMVS_FLOAT(v0, v_res, gvl);
j += gvl;
}
}else{
@ -112,34 +110,29 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
v_sum = VFMVVF_FLOAT(0, gvl);
for(i=0,j=0; i<n/gvl; i++){
v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
v0 = VFABS_FLOAT(v0, gvl);
v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);
v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
v1 = VFABS_FLOAT(v1, gvl);
v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);
j += gvl;
ix += inc_xv;
}
v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl);
asumf += VFFMVFS_FLOAT(v_res);
v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl);
if(j<n){
gvl = VSETVL(n-j);
v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
v0 = VFABS_FLOAT(v0, gvl);
mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
v1 = VFABS_FLOAT(v1, gvl);
v_sum = VFADDVV_FLOAT(v0, v1, gvl);
v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl);
asumf += VFFMVFS_FLOAT(v_res);
v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl);
}
}
asumf = EXTRACT_FLOAT(v_res);
return(asumf);
}

View File

@ -28,25 +28,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define VSETVL(n) __riscv_vsetvl_e32m4(n)
#define FLOAT_V_T vfloat32m4_t
#define VLSEV_FLOAT vlse32_v_f32m4
#define VSSEV_FLOAT vsse32_v_f32m4
#define VFMACCVF_FLOAT vfmacc_vf_f32m4
#define VFMVVF_FLOAT vfmv_v_f_f32m4
#define VFMULVF_FLOAT vfmul_vf_f32m4
#define VFMSACVF_FLOAT vfmsac_vf_f32m4
#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4
#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f32m4
#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define VSETVL(n) __riscv_vsetvl_e64m4(n)
#define FLOAT_V_T vfloat64m4_t
#define VLSEV_FLOAT vlse64_v_f64m4
#define VSSEV_FLOAT vsse64_v_f64m4
#define VFMACCVF_FLOAT vfmacc_vf_f64m4
#define VFMVVF_FLOAT vfmv_v_f_f64m4
#define VFMULVF_FLOAT vfmul_vf_f64m4
#define VFMSACVF_FLOAT vfmsac_vf_f64m4
#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4
#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f64m4
#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4
#endif
int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FLOAT beta_r, FLOAT beta_i, FLOAT *y, BLASLONG inc_y)

View File

@ -28,19 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define VSETVL(n) __riscv_vsetvl_e32m4(n)
#define FLOAT_V_T vfloat32m4_t
#define VLSEV_FLOAT vlse32_v_f32m4
#define VSSEV_FLOAT vsse32_v_f32m4
#define VFMACCVF_FLOAT vfmacc_vf_f32m4
#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define VSETVL(n) __riscv_vsetvl_e64m4(n)
#define FLOAT_V_T vfloat64m4_t
#define VLSEV_FLOAT vlse64_v_f64m4
#define VSSEV_FLOAT vsse64_v_f64m4
#define VFMACCVF_FLOAT vfmacc_vf_f64m4
#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4
#endif
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)

View File

@ -27,15 +27,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define VSETVL(n) __riscv_vsetvl_e32m4(n)
#define FLOAT_V_T vfloat32m4_t
#define VLSEV_FLOAT vlse32_v_f32m4
#define VSSEV_FLOAT vsse32_v_f32m4
#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define VSETVL(n) __riscv_vsetvl_e64m4(n)
#define FLOAT_V_T vfloat64m4_t
#define VLSEV_FLOAT vlse64_v_f64m4
#define VSSEV_FLOAT vsse64_v_f64m4
#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
#endif

View File

@ -27,37 +27,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define VSETVL(n) __riscv_vsetvl_e32m4(n)
#define VSETVL_MAX __riscv_vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m4_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
#define VLEV_FLOAT vle32_v_f32m4
#define VLSEV_FLOAT vlse32_v_f32m4
#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
#define VFMACCVV_FLOAT vfmacc_vv_f32m4
#define VFMVVF_FLOAT vfmv_v_f_f32m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFDOTVV_FLOAT vfdot_vv_f32m4
#define VFMULVV_FLOAT vfmul_vv_f32m4
#define VFMSACVV_FLOAT vfmsac_vv_f32m4
#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4
#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32
#define VLEV_FLOAT __riscv_vle32_v_f32m4
#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1
#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
#define VFDOTVV_FLOAT __riscv_vfdot_vv_f32m4
#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4
#define VFMSACVV_FLOAT __riscv_vfmsac_vv_f32m4
#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define VSETVL(n) __riscv_vsetvl_e64m4(n)
#define VSETVL_MAX __riscv_vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m4_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
#define VLEV_FLOAT vle64_v_f64m4
#define VLSEV_FLOAT vlse64_v_f64m4
#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
#define VFMACCVV_FLOAT vfmacc_vv_f64m4
#define VFMVVF_FLOAT vfmv_v_f_f64m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFDOTVV_FLOAT vfdot_vv_f64m4
#define VFMULVV_FLOAT vfmul_vv_f64m4
#define VFMSACVV_FLOAT vfmsac_vv_f64m4
#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4
#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64
#define VLEV_FLOAT __riscv_vle64_v_f64m4
#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1
#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
#define VFDOTVV_FLOAT __riscv_vfdot_vv_f64m4
#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4
#define VFMSACVV_FLOAT __riscv_vfmsac_vv_f64m4
#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4
#endif
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
@ -109,9 +109,9 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
ix += inc_xv;
iy += inc_yv;
}
v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl);
v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl);
dot[0] += VFMVFS_FLOAT(v_res);
v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl);
v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl);
dot[1] += VFMVFS_FLOAT(v_res);
//tail
if(j < n){
@ -132,9 +132,9 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
vr1 = VFMULVV_FLOAT(vx1, vy0, gvl);
vr1 = VFMSACVV_FLOAT(vr1, vx0, vy1, gvl);
#endif
v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl);
v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl);
dot[0] += VFMVFS_FLOAT(v_res);
v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl);
v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl);
dot[1] += VFMVFS_FLOAT(v_res);
}
CREAL(result) = dot[0];

File diff suppressed because it is too large

View File

@ -0,0 +1,140 @@
#include "common.h"
/* for debugging/unit tests
* this is a drop-in replacement for zgemm/cgemm/ztrmm/ctrmm kernels that supports arbitrary combinations of unroll values
*/
#ifdef TRMMKERNEL
#if defined(LEFT) != defined(TRANSA)
#define BACKWARDS
#endif
#endif
#ifdef DOUBLE
#define UNROLL_M ZGEMM_DEFAULT_UNROLL_M
#define UNROLL_N ZGEMM_DEFAULT_UNROLL_N
#else
#define UNROLL_M CGEMM_DEFAULT_UNROLL_M
#define UNROLL_N CGEMM_DEFAULT_UNROLL_N
#endif
int CNAME(BLASLONG M,BLASLONG N,BLASLONG K,FLOAT alphar,FLOAT alphai,FLOAT* A,FLOAT* B,FLOAT* C,BLASLONG ldc
#ifdef TRMMKERNEL
,BLASLONG offset
#endif
)
{
FLOAT res[UNROLL_M*UNROLL_N*2];
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
FLOAT sign[4] = { 1, -1, 1, 1};
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
FLOAT sign[4] = { 1, 1, 1, -1};
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
FLOAT sign[4] = { 1, 1, -1, 1};
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
FLOAT sign[4] = { 1, -1, -1, -1};
#endif
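/* note (added for clarity): the sign[] tables above select the conjugation
 * variant applied by the inner multiply-accumulate:
 *   NN/NT/TN/TT: res += a * b        -> { 1, -1,  1,  1 }
 *   NR/NC/TR/TC: res += a * conj(b)  -> { 1,  1,  1, -1 }
 *   RN/RT/CN/CT: res += conj(a) * b  -> { 1,  1, -1,  1 }
 *   RR/RC/CR/CC: res += conj(a * b)  -> { 1, -1, -1, -1 } */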
BLASLONG n_packing = UNROLL_N;
BLASLONG n_top = 0;
while(n_top < N)
{
while( n_top+n_packing > N )
n_packing >>= 1;
BLASLONG m_packing = UNROLL_M;
BLASLONG m_top = 0;
while (m_top < M)
{
while( m_top+m_packing > M )
m_packing >>= 1;
BLASLONG ai = K*m_top*2;
BLASLONG bi = K*n_top*2;
BLASLONG pass_K = K;
#ifdef TRMMKERNEL
#ifdef LEFT
BLASLONG off = offset + m_top;
#else
BLASLONG off = -offset + n_top;
#endif
#ifdef BACKWARDS
ai += off * m_packing*2;
bi += off * n_packing*2;
pass_K -= off;
#else
#ifdef LEFT
pass_K = off + m_packing;
#else
pass_K = off + n_packing;
#endif
#endif
#endif
memset( res, 0, UNROLL_M*UNROLL_N*2*sizeof(FLOAT) );
for (BLASLONG k=0; k<pass_K; k+=1)
{
for( BLASLONG ki = 0; ki < n_packing; ++ki )
{
FLOAT B0 = B[bi+ki*2+0];
FLOAT B1 = B[bi+ki*2+1];
for( BLASLONG kj = 0; kj < m_packing; ++kj )
{
FLOAT A0 = A[ai+kj*2+0];
FLOAT A1 = A[ai+kj*2+1];
res[(ki*UNROLL_M+kj)*2+0] += sign[0]*A0*B0 +sign[1]*A1*B1;
res[(ki*UNROLL_M+kj)*2+1] += sign[2]*A1*B0 +sign[3]*A0*B1;
}
}
ai += m_packing*2;
bi += n_packing*2;
}
BLASLONG cofs = ldc * n_top + m_top;
for( BLASLONG ki = 0; ki < n_packing; ++ki )
{
for( BLASLONG kj = 0; kj < m_packing; ++kj )
{
#ifdef TRMMKERNEL
FLOAT Cr = 0;
FLOAT Ci = 0;
#else
FLOAT Cr = C[(cofs+ki*ldc+kj)*2+0];
FLOAT Ci = C[(cofs+ki*ldc+kj)*2+1];
#endif
Cr += res[(ki*UNROLL_M+kj)*2+0]*alphar;
Cr += -res[(ki*UNROLL_M+kj)*2+1]*alphai;
Ci += res[(ki*UNROLL_M+kj)*2+1]*alphar;
Ci += res[(ki*UNROLL_M+kj)*2+0]*alphai;
C[(cofs+ki*ldc+kj)*2+0] = Cr;
C[(cofs+ki*ldc+kj)*2+1] = Ci;
}
}
m_top += m_packing;
}
n_top += n_packing;
}
return 0;
}
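Because the packing widths are only ever halved, the scalar reference kernel above tiles any M and N with a handful of shrinking edge tiles instead of needing dedicated edge kernels. A small illustrative trace (values assumed for the example, not taken from the patch), for UNROLL_M = 4 and M = 7:
/* m_top = 0: 0 + 4 <= 7, full tile   -> rows 0..3, m_top becomes 4 */
/* m_top = 4: 4 + 4 >  7, halve to 2  -> rows 4..5, m_top becomes 6 */
/* m_top = 6: 6 + 2 >  7, halve to 1  -> row  6,    m_top becomes 7 */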

View File

@ -27,23 +27,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define VSETVL(n) __riscv_vsetvl_e32m4(n)
#define FLOAT_V_T vfloat32m4_t
#define VLEV_FLOAT vle32_v_f32m4
#define VLSEV_FLOAT vlse32_v_f32m4
#define VSEV_FLOAT vse32_v_f32m4
#define VSSEV_FLOAT vsse32_v_f32m4
#define VFMACCVF_FLOAT vfmacc_vf_f32m4
#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
#define VLEV_FLOAT __riscv_vle32_v_f32m4
#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
#define VSEV_FLOAT __riscv_vse32_v_f32m4
#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define VSETVL(n) __riscv_vsetvl_e64m4(n)
#define FLOAT_V_T vfloat64m4_t
#define VLEV_FLOAT vle64_v_f64m4
#define VLSEV_FLOAT vlse64_v_f64m4
#define VSEV_FLOAT vse64_v_f64m4
#define VSSEV_FLOAT vsse64_v_f64m4
#define VFMACCVF_FLOAT vfmacc_vf_f64m4
#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
#define VLEV_FLOAT __riscv_vle64_v_f64m4
#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
#define VSEV_FLOAT __riscv_vse64_v_f64m4
#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4
#endif
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)

View File

@ -27,31 +27,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m4_t
#define VSETVL(n) __riscv_vsetvl_e32m2(n)
#define VSETVL_MAX __riscv_vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m2_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
#define VLSEV_FLOAT vlse32_v_f32m4
#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
#define VFMACCVV_FLOAT vfmacc_vv_f32m4
#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4
#define VFMVVF_FLOAT vfmv_v_f_f32m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFMULVV_FLOAT vfmul_vv_f32m4
#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32
#define VLSEV_FLOAT __riscv_vlse32_v_f32m2
#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m2_f32m1
#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m2
#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m2
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m2
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m4_t
#define VSETVL(n) __riscv_vsetvl_e64m2(n)
#define VSETVL_MAX __riscv_vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m2_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
#define VLSEV_FLOAT vlse64_v_f64m4
#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
#define VFMACCVV_FLOAT vfmacc_vv_f64m4
#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4
#define VFMVVF_FLOAT vfmv_v_f_f64m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFMULVV_FLOAT vfmul_vv_f64m4
#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64
#define VLSEV_FLOAT __riscv_vlse64_v_f64m2
#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m2_f64m1
#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m2
#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m2
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m2
#endif
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
@ -62,49 +62,43 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
FLOAT temp_r, temp_i;
FLOAT_V_T va0, va1, vx0, vx1, vr, vi;
unsigned int gvl = 0;
FLOAT_V_T_M1 v_res, v_z0;
gvl = VSETVL_MAX;
v_res = VFMVVF_FLOAT_M1(0, gvl);
v_z0 = VFMVVF_FLOAT_M1(0, gvl);
unsigned int gvl = VSETVL(m);
FLOAT_V_T_M1 v_res_r, v_res_i;
BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
BLASLONG stride_a = sizeof(FLOAT) * 2;
gvl = VSETVL(m);
BLASLONG inc_xv = inc_x * gvl * 2;
BLASLONG inc_av = gvl * 2;
BLASLONG inc_y2 = inc_y * 2;
BLASLONG lda2 = lda * 2;
for(i = 0; i < n; i++){
v_res_r = VFMVVF_FLOAT_M1(0, 1);
v_res_i = VFMVVF_FLOAT_M1(0, 1);
gvl = VSETVL(m);
j = 0;
ix = 0;
vr = VFMVVF_FLOAT(0, gvl);
vi = VFMVVF_FLOAT(0, gvl);
for(k = 0; k < m/gvl; k++){
va0 = VLSEV_FLOAT(&a_ptr[j], stride_a, gvl);
va1 = VLSEV_FLOAT(&a_ptr[j+1], stride_a, gvl);
vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
vr = VFMACCVV_FLOAT(vr, va0, vx0, gvl);
vr = VFMULVV_FLOAT(va0, vx0, gvl);
vi = VFMULVV_FLOAT(va0, vx1, gvl);
vr = VFNMSACVV_FLOAT(vr, va1, vx1, gvl);
vi = VFMACCVV_FLOAT(vi, va0, vx1, gvl);
vi = VFMACCVV_FLOAT(vi, va1, vx0, gvl);
#else
vr = VFMACCVV_FLOAT(vr, va0, vx0, gvl);
vr = VFMULVV_FLOAT(va0, vx0, gvl);
vi = VFMULVV_FLOAT(va0, vx1, gvl);
vr = VFMACCVV_FLOAT(vr, va1, vx1, gvl);
vi = VFMACCVV_FLOAT(vi, va0, vx1, gvl);
vi = VFNMSACVV_FLOAT(vi, va1, vx0, gvl);
#endif
v_res_r = VFREDSUM_FLOAT(vr, v_res_r, gvl);
v_res_i = VFREDSUM_FLOAT(vi, v_res_i, gvl);
j += inc_av;
ix += inc_xv;
}
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
temp_r = VFMVFS_FLOAT(v_res);
v_res = VFREDSUM_FLOAT(v_res, vi, v_z0, gvl);
temp_i = VFMVFS_FLOAT(v_res);
if(j/2 < m){
gvl = VSETVL(m-j/2);
va0 = VLSEV_FLOAT(&a_ptr[j], stride_a, gvl);
@ -113,21 +107,23 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
vr = VFMULVV_FLOAT(va0, vx0, gvl);
vr = VFNMSACVV_FLOAT(vr, va1, vx1, gvl);
vi = VFMULVV_FLOAT(va0, vx1, gvl);
vr = VFNMSACVV_FLOAT(vr, va1, vx1, gvl);
vi = VFMACCVV_FLOAT(vi, va1, vx0, gvl);
#else
vr = VFMULVV_FLOAT(va0, vx0, gvl);
vr = VFMACCVV_FLOAT(vr, va1, vx1, gvl);
vi = VFMULVV_FLOAT(va0, vx1, gvl);
vr = VFMACCVV_FLOAT(vr, va1, vx1, gvl);
vi = VFNMSACVV_FLOAT(vi, va1, vx0, gvl);
#endif
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
temp_r += VFMVFS_FLOAT(v_res);
v_res = VFREDSUM_FLOAT(v_res, vi, v_z0, gvl);
temp_i += VFMVFS_FLOAT(v_res);
v_res_r = VFREDSUM_FLOAT(vr, v_res_r, gvl);
v_res_i = VFREDSUM_FLOAT(vi, v_res_i, gvl);
}
temp_r = VFMVFS_FLOAT(v_res_r);
temp_i = VFMVFS_FLOAT(v_res_i);
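/* note (added for clarity): v_res_r/v_res_i are running m1 accumulators;
 * each VFREDSUM_FLOAT call folds a chunk's partial sums into element 0 on
 * top of the previous total, so the two scalar reads above happen only once
 * per column instead of once per chunk. */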
#if !defined(XCONJ)
y[iy] += alpha_r * temp_r - alpha_i * temp_i;
y[iy+1] += alpha_r * temp_i + alpha_i * temp_r;

View File

@ -27,37 +27,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define VSETVL(n) __riscv_vsetvl_e32m4(n)
#define VSETVL_MAX __riscv_vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m4_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
#define VLSEV_FLOAT vlse32_v_f32m4
#define VSSEV_FLOAT vsse32_v_f32m4
#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
#define VFMACCVV_FLOAT vfmacc_vv_f32m4
#define VFMACCVF_FLOAT vfmacc_vf_f32m4
#define VFMVVF_FLOAT vfmv_v_f_f32m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFMULVV_FLOAT vfmul_vv_f32m4
#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4
#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32
#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1
#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4
#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4
#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define VSETVL(n) __riscv_vsetvl_e64m4(n)
#define VSETVL_MAX __riscv_vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m4_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
#define VLSEV_FLOAT vlse64_v_f64m4
#define VSSEV_FLOAT vsse64_v_f64m4
#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
#define VFMACCVV_FLOAT vfmacc_vv_f64m4
#define VFMACCVF_FLOAT vfmacc_vf_f64m4
#define VFMVVF_FLOAT vfmv_v_f_f64m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFMULVV_FLOAT vfmul_vv_f64m4
#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4
#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64
#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1
#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4
#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4
#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4
#endif
int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){
@ -143,9 +143,9 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B
iy += inc_yv;
ia += inc_av;
}
v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl);
v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl);
temp_r2 = VFMVFS_FLOAT(v_res);
v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl);
v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl);
temp_i2 = VFMVFS_FLOAT(v_res);
if(i < m){
gvl = VSETVL(m-i);
@ -181,9 +181,9 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B
vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl);
#endif
v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl);
v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl);
temp_r2 += VFMVFS_FLOAT(v_res);
v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl);
v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl);
temp_i2 += VFMVFS_FLOAT(v_res);
}
}

View File

@ -27,37 +27,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define VSETVL(n) __riscv_vsetvl_e32m4(n)
#define VSETVL_MAX __riscv_vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m4_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
#define VLSEV_FLOAT vlse32_v_f32m4
#define VSSEV_FLOAT vsse32_v_f32m4
#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
#define VFMACCVV_FLOAT vfmacc_vv_f32m4
#define VFMACCVF_FLOAT vfmacc_vf_f32m4
#define VFMVVF_FLOAT vfmv_v_f_f32m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFMULVV_FLOAT vfmul_vv_f32m4
#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4
#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32
#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1
#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4
#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4
#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define VSETVL(n) __riscv_vsetvl_e64m4(n)
#define VSETVL_MAX __riscv_vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m4_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
#define VLSEV_FLOAT vlse64_v_f64m4
#define VSSEV_FLOAT vsse64_v_f64m4
#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
#define VFMACCVV_FLOAT vfmacc_vv_f64m4
#define VFMACCVF_FLOAT vfmacc_vf_f64m4
#define VFMVVF_FLOAT vfmv_v_f_f64m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFMULVV_FLOAT vfmul_vv_f64m4
#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4
#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64
#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1
#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4
#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4
#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4
#endif
int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){
@ -142,9 +142,9 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B
iy += inc_yv;
ia += inc_av;
}
v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl);
v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl);
temp_r2 = VFMVFS_FLOAT(v_res);
v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl);
v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl);
temp_i2 = VFMVFS_FLOAT(v_res);
if(i < j){
gvl = VSETVL(j-i);
@ -180,9 +180,9 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B
vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl);
#endif
v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl);
v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl);
temp_r2 += VFMVFS_FLOAT(v_res);
v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl);
v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl);
temp_i2 += VFMVFS_FLOAT(v_res);
}
}

View File

@ -26,264 +26,151 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m4_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
#define VLEV_FLOAT vle32_v_f32m4
#define VLSEV_FLOAT vlse32_v_f32m4
#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
#define VFMACCVV_FLOAT vfmacc_vv_f32m4
#define VFMVVF_FLOAT vfmv_v_f_f32m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFDOTVV_FLOAT vfdot_vv_f32m4
#define ABS fabsf
#define MASK_T vbool8_t
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m4_m
#define VMFGTVF_FLOAT vmfgt_vf_f32m4_b8
#define VMFIRSTM vmfirst_m_b8
#define VFDIVVF_FLOAT vfdiv_vf_f32m4
#define VMFLTVF_FLOAT vmflt_vf_f32m4_b8
#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1
#ifdef RISCV64_ZVL256B
# define LMUL m1
# if defined(DOUBLE)
# define ELEN 64
# define MLEN 64
# else
# define ELEN 32
# define MLEN 32
# endif
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m4_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
#define VLEV_FLOAT vle64_v_f64m4
#define VLSEV_FLOAT vlse64_v_f64m4
#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
#define VFMACCVV_FLOAT vfmacc_vv_f64m4
#define VFMVVF_FLOAT vfmv_v_f_f64m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFDOTVV_FLOAT vfdot_vv_f64m4
#define ABS fabs
#define MASK_T vbool16_t
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m4_m
#define VMFGTVF_FLOAT vmfgt_vf_f64m4_b16
#define VMFIRSTM vmfirst_m_b16
#define VFDIVVF_FLOAT vfdiv_vf_f64m4
#define VMFLTVF_FLOAT vmflt_vf_f64m4_b16
#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1
# define LMUL m4
# if defined(DOUBLE)
# define ELEN 64
# define MLEN 16
# else
# define ELEN 32
# define MLEN 8
# endif
#endif
#define _
#define JOIN2_X(x, y) x ## y
#define JOIN2(x, y) JOIN2_X(x, y)
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _)
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _)
#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL)
#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL)
#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _)
#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _)
#define MASK_T JOIN(vbool, MLEN, _t, _, _)
#define VFABS JOIN(__riscv_vfabs, _v_f, ELEN, LMUL, _)
#define VMFNE JOIN(__riscv_vmfne_vf_f,ELEN, LMUL, _b, MLEN)
#define VMFGT JOIN(__riscv_vmfgt_vv_f,ELEN, LMUL, _b, MLEN)
#define VMFEQ JOIN(__riscv_vmfeq_vv_f,ELEN, LMUL, _b, MLEN)
#define VCPOP JOIN(__riscv_vcpop, _m_b, MLEN, _, _)
#define VFREDMAX JOIN(__riscv_vfredmax_vs_f,ELEN,LMUL, JOIN2(_f, ELEN), m1)
#define VFIRST JOIN(__riscv_vfirst, _m_b, MLEN, _, _)
#define VRGATHER JOIN(__riscv_vrgather, _vx_f, ELEN, LMUL, _)
#define VFDIV JOIN(__riscv_vfdiv, _vf_f, ELEN, LMUL, _)
#define VFDIV_M JOIN(__riscv_vfdiv, _vv_f, ELEN, LMUL, _mu)
#define VFMUL JOIN(__riscv_vfmul, _vv_f, ELEN, LMUL, _)
#define VFMACC JOIN(__riscv_vfmacc, _vv_f, ELEN, LMUL, _)
#define VFMACC_M JOIN(__riscv_vfmacc, _vv_f, ELEN, LMUL, _mu)
#define VMSOF JOIN(__riscv_vmsof, _m_b, MLEN, _, _)
#define VMANDN JOIN(__riscv_vmandn, _mm_b, MLEN, _, _)
#define VFREDUSUM JOIN(__riscv_vfredusum_vs_f,ELEN,LMUL, JOIN2(_f, ELEN), m1)
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
#define EXTRACT_FLOAT0_V(v) JOIN(__riscv_vfmv_f_s_f, ELEN, LMUL, _f, ELEN)(v)
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i=0, j=0;
BLASLONG i=0;
if ( n < 0 ) return(0.0);
// if(n == 1) return (ABS(x[0]));
if(n < 0) return(0.0);
FLOAT_V_T vr, v0, v_zero;
FLOAT_V_T v_ssq, v_scale, v0, v1, v_zero;
unsigned int gvl = 0;
FLOAT_V_T_M1 v_res, v_z0;
gvl = VSETVL_MAX;
v_res = VFMVVF_FLOAT_M1(0, gvl);
v_z0 = VFMVVF_FLOAT_M1(0, gvl);
FLOAT scale = 0.0, ssq = 0.0;
MASK_T mask;
BLASLONG index = 0;
if(inc_x == 1){
BLASLONG n2 = n * 2;
gvl = VSETVL(n2);
vr = VFMVVF_FLOAT(0, gvl);
v_zero = VFMVVF_FLOAT(0, gvl);
for(i=0,j=0; i<n2/gvl; i++){
v0 = VLEV_FLOAT(&x[j], gvl);
//fabs(vector)
mask = VMFLTVF_FLOAT(v0, 0, gvl);
v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
//if scale change
mask = VMFGTVF_FLOAT(v0, scale, gvl);
index = VMFIRSTM(mask, gvl);
if(index == -1){//no elements greater than scale
if(scale != 0.0){
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
vr = VFMACCVV_FLOAT(vr, v0, v0, gvl);
}
}else{//found greater element
//ssq in vector vr: vr[0]
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
//total ssq before current vector
ssq += VFMVFS_FLOAT(v_res);
//find max
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
//update ssq before max_index
ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
//update scale
scale = VFMVFS_FLOAT(v_res);
//ssq in vector vr
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
}
j += gvl;
v_res = VFMVVF_FLOAT_M1(0, 1);
v_z0 = VFMVVF_FLOAT_M1(0, 1);
gvl = VSETVL(n);
v_ssq = VFMVVF_FLOAT(0, gvl);
v_scale = VFMVVF_FLOAT(0, gvl);
v_zero = VFMVVF_FLOAT(0, gvl);
unsigned int stride_x = inc_x * sizeof(FLOAT) * 2;
int idx = 0;
for(i=0; i<n/gvl; i++){
v0 = VLSEV_FLOAT( &x[idx], stride_x, gvl );
v1 = VLSEV_FLOAT( &x[idx+1], stride_x, gvl );
v0 = VFABS( v0, gvl );
v1 = VFABS( v1, gvl );
MASK_T scale_mask0 = VMFGT( v0, v_scale, gvl );
MASK_T scale_mask1 = VMFGT( v1, v_scale, gvl );
if( VCPOP( scale_mask0, gvl ) + VCPOP( scale_mask1, gvl ) > 0 ){ // scale change?
// find largest element in v0 and v1
v_res = VFREDMAX( v0, v_z0, gvl );
v_res = VFREDMAX( v1, v_res, gvl );
FLOAT const largest_elt = EXTRACT_FLOAT( v_res );
v_scale = VFDIV( v_scale, largest_elt, gvl ); // scale/largest_elt
v_scale = VFMUL( v_scale, v_scale, gvl ); // (scale/largest_elt)*(scale/largest_elt)
v_ssq = VFMUL( v_scale, v_ssq, gvl ); // ssq*(scale/largest_elt)*(scale/largest_elt)
v_scale = VFMVVF_FLOAT( largest_elt, gvl ); // splated largest_elt becomes new scale
}
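/* note (added for clarity): the updates above apply the usual nrm2 rescaling
 * identity ssq_new = ssq_old * (scale_old / scale_new)^2 entirely in vector
 * registers, so the running maximum can grow without any horizontal
 * reduction inside the loop. */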
//ssq in vector vr: vr[0]
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
//total ssq now
ssq += VFMVFS_FLOAT(v_res);
//tail
if(j < n2){
gvl = VSETVL(n2-j);
v0 = VLEV_FLOAT(&x[j], gvl);
//fabs(vector)
mask = VMFLTVF_FLOAT(v0, 0, gvl);
v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
//if scale change
mask = VMFGTVF_FLOAT(v0, scale, gvl);
index = VMFIRSTM(mask, gvl);
if(index == -1){//no elements greater than scale
if(scale != 0.0)
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
}else{//found greater element
//find max
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
//update ssq before max_index
ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
//update scale
scale = VFMVFS_FLOAT(v_res);
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
}
vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
//ssq in vector vr: vr[0]
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
//total ssq now
ssq += VFMVFS_FLOAT(v_res);
}
}else{
gvl = VSETVL(n);
vr = VFMVVF_FLOAT(0, gvl);
v_zero = VFMVVF_FLOAT(0, gvl);
unsigned int stride_x = inc_x * sizeof(FLOAT) * 2;
int idx = 0, inc_v = inc_x * gvl * 2;
for(i=0,j=0; i<n/gvl; i++){
v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl);
//fabs(vector)
mask = VMFLTVF_FLOAT(v0, 0, gvl);
v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
//if scale change
mask = VMFGTVF_FLOAT(v0, scale, gvl);
index = VMFIRSTM(mask, gvl);
if(index == -1){//no elements greater than scale
if(scale != 0.0){
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
vr = VFMACCVV_FLOAT(vr, v0, v0, gvl);
}
}else{//found greater element
//ssq in vector vr: vr[0]
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
//total ssq before current vector
ssq += VFMVFS_FLOAT(v_res);
//find max
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
//update ssq before max_index
ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
//update scale
scale = VFMVFS_FLOAT(v_res);
//ssq in vector vr
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
}
MASK_T nonzero_mask0 = VMFNE( v0, 0, gvl );
MASK_T nonzero_mask1 = VMFNE( v1, 0, gvl );
v0 = VFDIV_M( nonzero_mask0, v_zero, v0, v_scale, gvl );
v1 = VFDIV_M( nonzero_mask1, v_zero, v1, v_scale, gvl );
v_ssq = VFMACC_M( nonzero_mask0, v_ssq, v0, v0, gvl );
v_ssq = VFMACC_M( nonzero_mask1, v_ssq, v1, v1, gvl );
v0 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl);
//fabs(vector)
mask = VMFLTVF_FLOAT(v0, 0, gvl);
v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
//if scale change
mask = VMFGTVF_FLOAT(v0, scale, gvl);
index = VMFIRSTM(mask, gvl);
if(index == -1){//no elements greater than scale
if(scale != 0.0){
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
vr = VFMACCVV_FLOAT(vr, v0, v0, gvl);
}
}else{//found greater element
//ssq in vector vr: vr[0]
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
//total ssq before current vector
ssq += VFMVFS_FLOAT(v_res);
//find max
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
//update ssq before max_index
ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
//update scale
scale = VFMVFS_FLOAT(v_res);
//ssq in vector vr
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
}
j += gvl;
idx += inc_v;
}
//ssq in vector vr: vr[0]
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
//total ssq now
ssq += VFMVFS_FLOAT(v_res);
//tail
if(j < n){
gvl = VSETVL(n-j);
v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl);
//fabs(vector)
mask = VMFLTVF_FLOAT(v0, 0, gvl);
v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
//if scale change
mask = VMFGTVF_FLOAT(v0, scale, gvl);
index = VMFIRSTM(mask, gvl);
if(index == -1){//no elements greater than scale
if(scale != 0.0){
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
}
}else{//found greater element
//find max
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
//update ssq before max_index
ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
//update scale
scale = VFMVFS_FLOAT(v_res);
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
}
v0 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl);
//fabs(vector)
mask = VMFLTVF_FLOAT(v0, 0, gvl);
v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
//if scale change
mask = VMFGTVF_FLOAT(v0, scale, gvl);
index = VMFIRSTM(mask, gvl);
if(index == -1){//no elements greater than scale
if(scale != 0.0){
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
vr = VFMACCVV_FLOAT(vr, v0, v0, gvl);
}
}else{//found greater element
//ssq in vector vr: vr[0]
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
//total ssq before current vector
ssq += VFMVFS_FLOAT(v_res);
//find max
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
//update ssq before max_index
ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
//update scale
scale = VFMVFS_FLOAT(v_res);
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
}
//ssq in vector vr: vr[0]
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
//total ssq now
ssq += VFMVFS_FLOAT(v_res);
}
idx += inc_x * gvl * 2;
}
return(scale * sqrt(ssq));
v_res = VFREDUSUM(v_ssq, v_z0, gvl);
FLOAT ssq = EXTRACT_FLOAT(v_res);
FLOAT scale = EXTRACT_FLOAT0_V(v_scale);
//finish any tail using scalar ops
i*=gvl;
if(i<n){
i *= inc_x*2;
n *= inc_x*2;
FLOAT temp;
do{
if ( x[i] != 0.0 ){
temp = ABS( x[i] );
if ( scale < temp ){
ssq = 1 + ssq * ( scale / temp ) * ( scale / temp );
scale = temp ;
}else{
ssq += ( temp / scale ) * ( temp / scale );
}
}
if ( x[i+1] != 0.0 ){
temp = ABS( x[i+1] );
if ( scale < temp ){
ssq = 1 + ssq * ( scale / temp ) * ( scale / temp );
scale = temp ;
}else{
ssq += ( temp / scale ) * ( temp / scale );
}
}
i += inc_x*2;
}while(i<n);
}
return(scale * sqrt(ssq));
}
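For reference, both the removed vector path above (divide each nonzero element by the running scale, accumulate the squares, rescale when a larger element appears) and the new scalar tail implement the classic scaled sum-of-squares recurrence, which keeps the intermediate sum from overflowing or underflowing. A minimal scalar sketch of that recurrence (illustrative only; nrm2_ref is a hypothetical name, not part of the kernel):

#include <math.h>

static double nrm2_ref(long n, const double *x)
{
    double scale = 0.0, ssq = 1.0;
    for (long i = 0; i < n; i++) {
        if (x[i] == 0.0) continue;
        double ax = fabs(x[i]);
        if (scale < ax) {
            /* rescale the accumulated sum to the new, larger scale */
            ssq = 1.0 + ssq * (scale / ax) * (scale / ax);
            scale = ax;
        } else {
            ssq += (ax / scale) * (ax / scale);
        }
    }
    return scale * sqrt(ssq); /* the 2-norm, computed without overflow */
}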

View File

@ -27,27 +27,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define VSETVL(n) __riscv_vsetvl_e32m4(n)
#define VSETVL_MAX __riscv_vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m4_t
#define VLEV_FLOAT vle32_v_f32m4
#define VLSEV_FLOAT vlse32_v_f32m4
#define VSEV_FLOAT vse32_v_f32m4
#define VSSEV_FLOAT vsse32_v_f32m4
#define VFMACCVF_FLOAT vfmacc_vf_f32m4
#define VFMULVF_FLOAT vfmul_vf_f32m4
#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
#define VLEV_FLOAT __riscv_vle32_v_f32m4
#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
#define VSEV_FLOAT __riscv_vse32_v_f32m4
#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4
#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define VSETVL(n) __riscv_vsetvl_e64m4(n)
#define VSETVL_MAX __riscv_vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m4_t
#define VLEV_FLOAT vle64_v_f64m4
#define VLSEV_FLOAT vlse64_v_f64m4
#define VSEV_FLOAT vse64_v_f64m4
#define VSSEV_FLOAT vsse64_v_f64m4
#define VFMACCVF_FLOAT vfmacc_vf_f64m4
#define VFMULVF_FLOAT vfmul_vf_f64m4
#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
#define VLEV_FLOAT __riscv_vle64_v_f64m4
#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
#define VSEV_FLOAT __riscv_vse64_v_f64m4
#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4
#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4
#endif
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
@ -59,7 +59,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
unsigned int gvl = 0;
FLOAT_V_T vt0, vt1, vx0, vx1, vy0, vy1;
gvl = VSETVL(n);
gvl = VSETVL((inc_x != 0 && inc_y != 0) ? n : 1);
BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);
BLASLONG inc_xv = inc_x * 2 * gvl;
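The ternary added to the VSETVL call above is one of the corner-case fixes: when either increment is zero, the strided loads and stores in this rot kernel all alias the same complex element, and each iteration must see the result of the previous one, so the vector length is requested as if n were 1 and the loop effectively serializes. A simplified real-valued sketch of the data dependence that forces this (illustrative, hypothetical helper name):

/* Assuming inc_x == inc_y == 0: n dependent updates of a single element pair. */
static void rot_stride0_ref(long n, float *x, float *y, float c, float s)
{
    for (long i = 0; i < n; i++) {
        float t = c * x[0] + s * y[0];
        y[0]    = c * y[0] - s * x[0];
        x[0]    = t;
    }
}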

View File

@ -27,25 +27,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define VSETVL(n) __riscv_vsetvl_e32m4(n)
#define VSETVL_MAX __riscv_vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m4_t
#define VLSEV_FLOAT vlse32_v_f32m4
#define VSSEV_FLOAT vsse32_v_f32m4
#define VFMACCVF_FLOAT vfmacc_vf_f32m4
#define VFMULVF_FLOAT vfmul_vf_f32m4
#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
#define VFMVVF_FLOAT vfmv_v_f_f32m4
#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4
#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define VSETVL(n) __riscv_vsetvl_e64m4(n)
#define VSETVL_MAX __riscv_vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m4_t
#define VLSEV_FLOAT vlse64_v_f64m4
#define VSSEV_FLOAT vsse64_v_f64m4
#define VFMACCVF_FLOAT vfmacc_vf_f64m4
#define VFMULVF_FLOAT vfmul_vf_f64m4
#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
#define VFMVVF_FLOAT vfmv_v_f_f64m4
#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4
#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
#endif
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)

View File

@ -0,0 +1,131 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
#ifdef RISCV64_ZVL256B
# define LMUL m2
# if defined(DOUBLE)
# define ELEN 64
# define MLEN _b32
# else
# define ELEN 32
# define MLEN _b16
# endif
#else
# define LMUL m8
# if defined(DOUBLE)
# define ELEN 64
# define MLEN _b8
# else
# define ELEN 32
# define MLEN _b4
# endif
#endif
#define _
#define JOIN2_X(x, y) x ## y
#define JOIN2(x, y) JOIN2_X(x, y)
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _)
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _)
#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL)
#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL)
#define VFREDSUMVS_FLOAT JOIN(__riscv_vfredusum_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1))
#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _)
#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _)
#define VFADDVV_FLOAT JOIN(__riscv_vfadd, _vv_f, ELEN, LMUL, _)
#define VMFLTVF_FLOAT JOIN(__riscv_vmflt, _vf_f, ELEN, LMUL, MLEN)
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i=0, j=0;
BLASLONG ix=0;
FLOAT asumf=0.0;
if (n <= 0 || inc_x <= 0) return(asumf);
unsigned int gvl = 0;
FLOAT_V_T v0, v1, v_zero,v_sum;
FLOAT_V_T_M1 v_res;
v_res = VFMVVF_FLOAT_M1(0, 1);
if(inc_x == 1){
BLASLONG n2 = n * 2;
gvl = VSETVL(n2);
v_zero = VFMVVF_FLOAT(0, gvl);
if(gvl <= n2/2){
v_sum = VFMVVF_FLOAT(0, gvl);
for(i=0,j=0; i<n2/(gvl*2); i++){
v0 = VLEV_FLOAT(&x[j], gvl);
v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);
v1 = VLEV_FLOAT(&x[j+gvl], gvl);
v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);
j += gvl * 2;
}
v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl);
}
for(;j<n2;){
gvl = VSETVL(n2-j);
v0 = VLEV_FLOAT(&x[j], gvl);
v_res = VFREDSUMVS_FLOAT(v0, v_res, gvl);
j += gvl;
}
}else{
gvl = VSETVL(n);
unsigned int stride_x = inc_x * sizeof(FLOAT) * 2;
v_zero = VFMVVF_FLOAT(0, gvl);
BLASLONG inc_xv = inc_x * 2 * gvl;
v_sum = VFMVVF_FLOAT(0, gvl);
for(i=0,j=0; i<n/gvl; i++){
v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);
v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);
j += gvl;
ix += inc_xv;
}
v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl);
if(j<n){
gvl = VSETVL(n-j);
v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
v_sum = VFADDVV_FLOAT(v0, v1, gvl);
v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl);
}
}
asumf = EXTRACT_FLOAT(v_res);
return(asumf);
}
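The JOIN/JOIN2 helpers above assemble the intrinsic names from ELEN and LMUL; the empty #define _ lets a trailing _ argument paste to nothing. A standalone sketch of the same token-pasting trick, assuming the zvl256b single-precision settings (ELEN 32, LMUL m2), outside the kernel source:

/* Illustrative expansion only. */
#define _
#define JOIN2_X(x, y) x ## y
#define JOIN2(x, y) JOIN2_X(x, y)
#define JOIN(v, w, x, y, z) JOIN2(JOIN2(JOIN2(JOIN2(v, w), x), y), z)
#define ELEN 32
#define LMUL m2
#define VSETVL     JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _)   /* expands to __riscv_vsetvl_e32m2 */
#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) /* expands to __riscv_vle32_v_f32m2 */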

View File

@ -27,35 +27,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include <stdio.h>
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VSEV_FLOAT vse32_v_f32m8
#define VSSEV_FLOAT vsse32_v_f32m8
#ifdef RISCV64_ZVL256B
# define LMUL m2
# if defined(DOUBLE)
# define ELEN 64
# define MLEN 64
# else
# define ELEN 32
# define MLEN 32
# endif
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m8_t
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VSEV_FLOAT vse64_v_f64m8
#define VSSEV_FLOAT vsse64_v_f64m8
# define LMUL m8
# if defined(DOUBLE)
# define ELEN 64
# define MLEN 16
# else
# define ELEN 32
# define MLEN 8
# endif
#endif
#define _
#define JOIN2_X(x, y) x ## y
#define JOIN2(x, y) JOIN2_X(x, y)
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _)
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL)
#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL)
#define VSEV_FLOAT JOIN(__riscv_vse, ELEN, _v_f, ELEN, LMUL)
#define VSSEV_FLOAT JOIN(__riscv_vsse, ELEN, _v_f, ELEN, LMUL)
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
BLASLONG i = 0, j = 0;
BLASLONG ix = 0,iy = 0;
BLASLONG stride_x, stride_y;
FLOAT_V_T vx0, vx1, vy0, vy1;
unsigned int gvl = 0;
unsigned int gvl = VSETVL((inc_x != 0 && inc_y != 0) ? n : 1);
if( inc_x == 0 && inc_y == 0 ) { n = n & 1; }
if (n < 0) return(0);
if(inc_x == 1 && inc_y == 1){
gvl = VSETVL(n);
BLASLONG n2 = n * 2;
if(gvl <= n2/2){
for(i=0,j=0; i<n2/(2*gvl); i++){
@ -80,7 +95,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm
j += gvl;
}
}else{
gvl = VSETVL(n);
stride_x = inc_x * 2 * sizeof(FLOAT);
stride_y = inc_y * 2 * sizeof(FLOAT);
BLASLONG inc_xv = inc_x * gvl * 2;
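As in the rot kernel, the new VSETVL call and the n = n & 1 guard above handle degenerate increments: with inc_x == inc_y == 0 every iteration swaps the same complex element pair, so n swaps leave the same state as n & 1 swaps and a single lane suffices. A scalar sketch of that equivalence (swap_once is a hypothetical helper for illustration):

/* n applications of swap_once(x, y) are equivalent to (n & 1) applications. */
static void swap_once(float *x, float *y)
{
    float tr = x[0], ti = x[1];   /* one complex element: re, im */
    x[0] = y[0]; x[1] = y[1];
    y[0] = tr;   y[1] = ti;
}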

File diff suppressed because it is too large Load Diff

param.h
View File

@ -3121,6 +3121,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#ifdef RISCV64_ZVL256B
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL
#define SGEMM_DEFAULT_UNROLL_M 16
#define SGEMM_DEFAULT_UNROLL_N 8
#define DGEMM_DEFAULT_UNROLL_M 8
#define DGEMM_DEFAULT_UNROLL_N 8
#define CGEMM_DEFAULT_UNROLL_M 8
#define CGEMM_DEFAULT_UNROLL_N 8
#define ZGEMM_DEFAULT_UNROLL_M 8
#define ZGEMM_DEFAULT_UNROLL_N 4
#define SGEMM_DEFAULT_P 128
#define DGEMM_DEFAULT_P 64
#define CGEMM_DEFAULT_P 64
#define ZGEMM_DEFAULT_P 64
#define SGEMM_DEFAULT_Q 128
#define DGEMM_DEFAULT_Q 128
#define CGEMM_DEFAULT_Q 128
#define ZGEMM_DEFAULT_Q 64
#define SGEMM_DEFAULT_R 16384
#define DGEMM_DEFAULT_R 8192
#define CGEMM_DEFAULT_R 8192
#define ZGEMM_DEFAULT_R 4096
#define SYMV_P 16
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 0
#endif
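The unroll factors above line up with the 256-bit vector length this target assumes; the arithmetic, sketched here as illustrative comments only (not part of param.h):

/* Assuming VLEN = 256 bits:
 *   256 / 32 = 8 floats or 256 / 64 = 4 doubles per single vector register;
 *   with the LMUL=2 register groups used by the zvl256b kernels, one group
 *   holds 16 floats or 8 doubles, which matches SGEMM_DEFAULT_UNROLL_M (16)
 *   and DGEMM_DEFAULT_UNROLL_M (8). */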
#ifdef ARMV7
#define SNUMOPT 2
#define DNUMOPT 2