* update intrinsics to match latest spec at https://github.com/riscv-non-isa/rvv-intrinsic-doc (in particular, __riscv_ prefixes for rvv intrinsics)

* fix multiple numerical stability and corner case issues
* add a script to generate arbitrary gemm kernel shapes
* add a generic zvl256b target to demonstrate large gemm kernel unrolls
This commit is contained in:
Sergei Lewis
2023-02-24 10:44:55 +00:00
parent c19dff0a31
commit 2406958629
58 changed files with 18648 additions and 2388 deletions

View File

@@ -28,30 +28,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include <math.h>
#include <float.h>
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFMAXVV_FLOAT vfmax_vv_f32m8
#ifdef RISCV64_ZVL256B
# define LMUL m2
# if defined(DOUBLE)
# define ELEN 64
# define MLEN 32
# else
# define ELEN 32
# define MLEN 16
# endif
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFMAXVV_FLOAT vfmax_vv_f64m8
# define LMUL m8
# if defined(DOUBLE)
# define ELEN 64
# define MLEN 8
# else
# define ELEN 32
# define MLEN 4
# endif
#endif
#define _
#define JOIN2_X(x, y) x ## y
#define JOIN2(x, y) JOIN2_X(x, y)
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _)
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _)
#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL)
#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL)
#define VFREDMAXVS_FLOAT JOIN(__riscv_vfredmax_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1))
#define MASK_T JOIN(vbool, MLEN, _t, _, _)
#define VMFLTVF_FLOAT JOIN(__riscv_vmflt_vf_f, ELEN, LMUL, _b, MLEN)
#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _)
#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _)
#define VFMAXVV_FLOAT JOIN(__riscv_vfmax, _vv_f, ELEN, LMUL, _)
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i=0, j=0;
@@ -59,10 +73,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
FLOAT maxf=-FLT_MAX;
unsigned int gvl = 0;
FLOAT_V_T v0, v1, v_max;
FLOAT_V_T_M1 v_res, v_min;
gvl = VSETVL_MAX;
v_res = VFMVVF_FLOAT_M1(0, gvl);
v_min = VFMVVF_FLOAT_M1(-FLT_MAX, gvl);
FLOAT_V_T_M1 v_res;
v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1);
if(inc_x == 1){
gvl = VSETVL(n);
@@ -76,15 +88,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
v_max = VFMAXVV_FLOAT(v_max, v1, gvl);
j += gvl * 2;
}
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl);
maxf = *((FLOAT*)&v_res);
v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
}
for(;j<n;){
gvl = VSETVL(n-j);
v0 = VLEV_FLOAT(&x[j], gvl);
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_min, gvl);
if(*((FLOAT*)&v_res) > maxf)
maxf = *((FLOAT*)&v_res);
v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl);
j += gvl;
}
}else{
@@ -102,18 +111,16 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
j += gvl * 2;
idx += inc_xv * 2;
}
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl);
maxf = *((FLOAT*)&v_res);
v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
}
for(;j<n;){
gvl = VSETVL(n-j);
v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_min, gvl);
if(*((FLOAT*)&v_res) > maxf)
maxf = *((FLOAT*)&v_res);
v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl);
j += gvl;
}
}
maxf = EXTRACT_FLOAT(v_res);
return(maxf);
}