diff --git a/kernel/riscv64/amax_vector.c b/kernel/riscv64/amax_vector.c index b6aec131e..5312f9ef0 100644 --- a/kernel/riscv64/amax_vector.c +++ b/kernel/riscv64/amax_vector.c @@ -29,29 +29,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFMAXVV_FLOAT vfmaxvv_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFMAXVV_FLOAT vfmax_vv_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFMAXVV_FLOAT vfmaxvv_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFMAXVV_FLOAT vfmax_vv_f64m8 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -62,19 +66,25 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return(maxf); unsigned int gvl = 0; FLOAT_V_T v0, v1, v_max; + FLOAT_V_T_M1 v_res, v_zero; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_zero = VFMVVF_FLOAT_M1(0, gvl); MASK_T mask0, mask1; FLOAT zero = 0.0; if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); if(gvl <= n/2){ v_max = VFMVVF_FLOAT(0, gvl); for(i=0,j=0; i maxf) - maxf = v0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v0, v_zero, gvl); + if(v_res[0] > maxf) + maxf = v_res[0]; j += gvl; } }else{ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); BLASLONG stride_x = inc_x * sizeof(FLOAT); if(gvl <= n/2){ BLASLONG inc_xv = inc_x * gvl; @@ -162,6 +175,7 @@ asm volatile( //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); #if defined(DOUBLE) asm volatile( + "vsetvli zero, zero, e8, m1\n\t" "vor.vv v0, %1, %1\n\t" "vsetvli x0, %3, e64,m8 \n\t" "vfrsub.vf %0, %0, %2, v0.t \n\t" @@ -170,6 +184,7 @@ asm volatile( :"v0"); #else asm volatile( + "vsetvli zero, zero, e8, m1\n\t" "vor.vv v0, %1, %1\n\t" "vsetvli x0, %3, e32,m8 \n\t" "vfrsub.vf %0, %0, %2, v0.t \n\t" @@ -185,6 +200,7 @@ asm volatile( //v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl); #if defined(DOUBLE) asm volatile( + "vsetvli zero, zero, e8, m1\n\t" "vor.vv v0, %1, %1\n\t" "vsetvli x0, %3, e64,m8 \n\t" "vfrsub.vf %0, %0, %2, v0.t \n\t" 
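The reduction API is the heart of this change: the old `vfredmaxvs_float32xm8` consumed and produced an LMUL=8 vector, while the new `vfredmax_vs_f32m8_f32m1` folds an m8 accumulator into an m1 result whose element 0 carries the scalar, so each kernel now sets up a one-time `v_res`/seed pair under `vsetvlmax_e*m1()`. Masked intrinsics also move the mask to the first argument (`vfrsub_vf_f32m8_m(mask, maskedoff, src, scalar, vl)`), which is why every `VFRSUBVF_MASK_FLOAT` call site in the diff is reordered. Below is a minimal standalone sketch of the resulting amax pattern; the function name and the omitted tail loop are mine, the intrinsic names are the pre-`__riscv_`-prefix ones this patch targets (newer toolchains spell them with a `__riscv_` prefix and explicit tail policies), and the `v[0]` element read is the same compiler extension these kernels already rely on.

```c
#include <riscv_vector.h>

/* amax reduction pattern after this patch (v0.10-era intrinsic names,
 * single precision, unit stride; tail handling omitted for brevity). */
static float amax_sketch(const float *x, size_t n)
{
    size_t mvl = vsetvlmax_e32m1();
    vfloat32m1_t v_res  = vfmv_v_f_f32m1(0, mvl);  /* m1 reduction result */
    vfloat32m1_t v_zero = vfmv_v_f_f32m1(0, mvl);  /* seed: 0 is the identity
                                                      for max over |x| */
    size_t gvl = vsetvl_e32m8(n);
    vfloat32m8_t v_max = vfmv_v_f_f32m8(0, gvl);
    for (size_t j = 0; j + gvl <= n; j += gvl) {
        vfloat32m8_t v0 = vle_v_f32m8(&x[j], gvl);
        vbool4_t neg = vmflt_vf_f32m8_b4(v0, 0, gvl);  /* lanes below zero  */
        v0 = vfrsub_vf_f32m8_m(neg, v0, v0, 0, gvl);   /* 0 - v0, i.e. |v0| */
        v_max = vfmax_vv_f32m8(v_max, v0, gvl);
    }
    v_res = vfredmax_vs_f32m8_f32m1(v_res, v_max, v_zero, gvl);
    return v_res[0];
}
```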
@@ -193,6 +209,7 @@ asm volatile( :"v0"); #else asm volatile( + "vsetvli zero, zero, e8, m1\n\t" "vor.vv v0, %1, %1\n\t" "vsetvli x0, %3, e32,m8 \n\t" "vfrsub.vf %0, %0, %2, v0.t \n\t" @@ -205,17 +222,17 @@ asm volatile( j += gvl*2; ix += inc_xv*2; } - v0 = VFMVVF_FLOAT(0, gvl); - v0 = VFREDMAXVS_FLOAT(v_max, v0, gvl); - maxf = v0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_zero, gvl); + maxf = v_res[0]; } for(;j maxf) - maxf = v0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v0, v_zero, gvl); + if(v_res[0] > maxf) + maxf = v_res[0]; j += gvl; } } diff --git a/kernel/riscv64/amin_vector.c b/kernel/riscv64/amin_vector.c index 53243ad56..ae2867ef8 100644 --- a/kernel/riscv64/amin_vector.c +++ b/kernel/riscv64/amin_vector.c @@ -30,29 +30,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMINVS_FLOAT vfredminvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFMINVV_FLOAT vfminvv_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFMINVV_FLOAT vfmin_vv_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMINVS_FLOAT vfredminvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFMINVV_FLOAT vfminvv_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFMINVV_FLOAT vfmin_vv_f64m8 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -62,11 +66,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT minf=FLT_MAX; unsigned int gvl = 0; FLOAT_V_T v0, v1, v_min; + FLOAT_V_T_M1 v_res, v_max; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); MASK_T mask0, mask1; - FLOAT zero = 0.0; + FLOAT zero = 0.0; if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); if(gvl <= n/2){ v_min = VFMVVF_FLOAT(FLT_MAX, gvl); for(i=0,j=0; i #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDSUMVS_FLOAT vfredsumvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT
vfrsubvf_mask_float32xm8 -#define VFADDVV_FLOAT vfaddvv_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDSUMVS_FLOAT vfredsum_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFADDVV_FLOAT vfadd_vv_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDSUMVS_FLOAT vfredsumvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFADDVV_FLOAT vfaddvv_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDSUMVS_FLOAT vfredsum_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFADDVV_FLOAT vfadd_vv_f64m8 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { @@ -61,39 +65,43 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return(asumf); unsigned int gvl = 0; FLOAT_V_T v0, v1, v_zero,v_sum; + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); MASK_T mask0, mask1; if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); v_zero = VFMVVF_FLOAT(0, gvl); if(gvl <= n/2){ v_sum = VFMVVF_FLOAT(0, gvl); for(i=0,j=0; i 0){ - vx = VFMVVF_FLOAT(0, gvl); - vx = VFREDSUM_FLOAT(vr, vx, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + dot += v_res[0]; } //tail if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); vx = VLEV_FLOAT(&x[j], gvl); vy = VLEV_FLOAT(&y[j], gvl); FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - vx = VFREDSUM_FLOAT(vr, vz, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + dot += v_res[0]; } }else if(inc_y == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); vr = VFMVVF_FLOAT(0, gvl); unsigned int stride_x = inc_x * sizeof(FLOAT); for(i=0,j=0; i 0){ - vx = VFMVVF_FLOAT(0, gvl); - vx = VFREDSUM_FLOAT(vr, vx, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + dot += v_res[0]; } //tail if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); vy = VLEV_FLOAT(&y[j], gvl); FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - vx = VFREDSUM_FLOAT(vr, vz, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + dot += v_res[0]; } }else if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); vr = VFMVVF_FLOAT(0, gvl); unsigned int stride_y = inc_y * sizeof(FLOAT); for(i=0,j=0; i 0){ - vx = VFMVVF_FLOAT(0, gvl); - vx = VFREDSUM_FLOAT(vr, vx, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + dot += v_res[0]; } //tail if(j < 
n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); vx = VLEV_FLOAT(&x[j], gvl); vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - vx = VFREDSUM_FLOAT(vr, vz, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + dot += v_res[0]; } }else{ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); vr = VFMVVF_FLOAT(0, gvl); unsigned int stride_x = inc_x * sizeof(FLOAT); unsigned int stride_y = inc_y * sizeof(FLOAT); @@ -150,20 +156,19 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) j += gvl; } if(j > 0){ - vx = VFMVVF_FLOAT(0, gvl); - vx = VFREDSUM_FLOAT(vr, vx, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + dot += v_res[0]; } //tail if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - vx = VFREDSUM_FLOAT(vr, vz, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + dot += v_res[0]; } } return(dot); diff --git a/kernel/riscv64/gemv_n_vector.c b/kernel/riscv64/gemv_n_vector.c index bd4d23eae..32ca8618b 100644 --- a/kernel/riscv64/gemv_n_vector.c +++ b/kernel/riscv64/gemv_n_vector.c @@ -27,23 +27,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M4 -#define FLOAT_V_T float32xm4_t -#define VLEV_FLOAT vlev_float32xm4 -#define VLSEV_FLOAT vlsev_float32xm4 -#define VSEV_FLOAT vsev_float32xm4 -#define VSSEV_FLOAT vssev_float32xm4 -#define VFMACCVF_FLOAT vfmaccvf_float32xm4 +#define VSETVL(n) vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLEV_FLOAT vle_v_f32m4 +#define VLSEV_FLOAT vlse_v_f32m4 +#define VSEV_FLOAT vse_v_f32m4 +#define VSSEV_FLOAT vsse_v_f32m4 +#define VFMACCVF_FLOAT vfmacc_vf_f32m4 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M4 -#define FLOAT_V_T float64xm4_t -#define VLEV_FLOAT vlev_float64xm4 -#define VLSEV_FLOAT vlsev_float64xm4 -#define VSEV_FLOAT vsev_float64xm4 -#define VSSEV_FLOAT vssev_float64xm4 -#define VFMACCVF_FLOAT vfmaccvf_float64xm4 +#define VSETVL(n) vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLEV_FLOAT vle_v_f64m4 +#define VLSEV_FLOAT vlse_v_f64m4 +#define VSEV_FLOAT vse_v_f64m4 +#define VSSEV_FLOAT vsse_v_f64m4 +#define VFMACCVF_FLOAT vfmacc_vf_f64m4 #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) @@ -57,7 +55,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO FLOAT_V_T va0, va1, vy0, vy1; unsigned int gvl = 0; if(inc_y == 1){ - gvl = vsetvli(m, RVV_EFLOAT, RVV_M); + gvl = VSETVL(m); if(gvl <= m/2){ for(k=0,j=0; k maxf){ //tail index v_max_index = VIDV_UINT(gvl); @@ -135,7 +142,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } } }else{ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); unsigned int stride_x = inc_x * sizeof(FLOAT); unsigned int idx = 0, inc_v = gvl * inc_x; @@ -145,35 +152,33 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); //fabs(vector) mask = VMFLTVF_FLOAT(vx, 0, gvl); - vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 
0, gvl); //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, gvl); - v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl); - v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; idx += inc_v; } - vx = VFMVVF_FLOAT(0, gvl); - vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); - maxf = vx[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); + maxf = v_res[0]; mask = VMFGEVF_FLOAT(v_max, maxf, gvl); max_index = VMFIRSTM(mask,gvl); max_index = v_max_index[max_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); //fabs(vector) mask = VMFLTVF_FLOAT(vx, 0, gvl); - v_max = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + v_max = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); - vx = VFMVVF_FLOAT(0, gvl); - vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); - FLOAT cur_maxf = vx[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); + FLOAT cur_maxf = v_res[0]; if(cur_maxf > maxf){ //tail index v_max_index = VIDV_UINT(gvl); diff --git a/kernel/riscv64/iamin_vector.c b/kernel/riscv64/iamin_vector.c index 608f19a00..5bcffece5 100644 --- a/kernel/riscv64/iamin_vector.c +++ b/kernel/riscv64/iamin_vector.c @@ -32,49 +32,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(DOUBLE) #define ABS fabs -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMINVS_FLOAT vfredminvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFMINVV_FLOAT vfminvv_float64xm8 -#define VMFLEVF_FLOAT vmflevf_e64xm8_float64xm8 -#define VMFIRSTM vmfirstm_e64xm8 -#define UINT_V_T uint64xm8_t -#define VIDV_MASK_UINT vidv_mask_uint64xm8 -#define VIDV_UINT vidv_uint64xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 -#define VADDVX_UINT vaddvx_uint64xm8 -#define VMVVX_UINT vmvvx_uint64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFMINVV_FLOAT vfmin_vv_f64m8 +#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 +#define VMFIRSTM vmfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT vid_v_u64m8_m +#define VIDV_UINT vid_v_u64m8 +#define VADDVX_MASK_UINT vadd_vx_u64m8_m +#define VADDVX_UINT vadd_vx_u64m8 +#define VMVVX_UINT vmv_v_x_u64m8 #else #define ABS fabsf -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMINVS_FLOAT vfredminvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT 
vfrsubvf_mask_float32xm8 -#define VFMINVV_FLOAT vfminvv_float32xm8 -#define VMFLEVF_FLOAT vmflevf_e32xm8_float32xm8 -#define VMFIRSTM vmfirstm_e32xm8 -#define UINT_V_T uint32xm8_t -#define VIDV_MASK_UINT vidv_mask_uint32xm8 -#define VIDV_UINT vidv_uint32xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 -#define VADDVX_UINT vaddvx_uint32xm8 -#define VMVVX_UINT vmvvx_uint32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFMINVV_FLOAT vfmin_vv_f32m8 +#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 +#define VMFIRSTM vmfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT vid_v_u32m8_m +#define VIDV_UINT vid_v_u32m8 +#define VADDVX_MASK_UINT vadd_vx_u32m8_m +#define VADDVX_UINT vadd_vx_u32m8 +#define VMVVX_UINT vmv_v_x_u32m8 #endif @@ -89,42 +93,45 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) UINT_V_T v_min_index; MASK_T mask; unsigned int gvl = 0; + FLOAT_V_T_M1 v_res, v_max; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); v_min = VFMVVF_FLOAT(FLT_MAX, gvl); v_min_index = VMVVX_UINT(0, gvl); for(i=0,j=0; i < n/gvl; i++){ vx = VLEV_FLOAT(&x[j], gvl); //fabs(vector) mask = VMFLTVF_FLOAT(vx, 0, gvl); - vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); //index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, gvl); - v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl); - v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; } - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - minf = vx[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + minf = v_res[0]; mask = VMFLEVF_FLOAT(v_min, minf, gvl); min_index = VMFIRSTM(mask,gvl); min_index = v_min_index[min_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); vx = VLEV_FLOAT(&x[j], gvl); //fabs(vector) mask = VMFLTVF_FLOAT(vx, 0, gvl); - v_min = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + v_min = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - FLOAT cur_minf = vx[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + FLOAT cur_minf = v_res[0]; if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); @@ -136,7 +143,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } } }else{ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); unsigned int stride_x = inc_x * sizeof(FLOAT); unsigned int idx = 0, inc_v = gvl * inc_x; @@ -146,35 +153,33 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); //fabs(vector) mask = VMFLTVF_FLOAT(vx, 0, gvl); - vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); //index where element less than v_min mask = 
VMFLTVV_FLOAT(vx, v_min, gvl); - v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl); - v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; idx += inc_v; } - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - minf = vx[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + minf = v_res[0]; mask = VMFLEVF_FLOAT(v_min, minf, gvl); min_index = VMFIRSTM(mask,gvl); min_index = v_min_index[min_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); //fabs(vector) mask = VMFLTVF_FLOAT(vx, 0, gvl); - v_min = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + v_min = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - FLOAT cur_minf = vx[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + FLOAT cur_minf = v_res[0]; if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); diff --git a/kernel/riscv64/imax_vector.c b/kernel/riscv64/imax_vector.c index 44af7101b..42705f5de 100644 --- a/kernel/riscv64/imax_vector.c +++ b/kernel/riscv64/imax_vector.c @@ -32,45 +32,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(DOUBLE) #define ABS fabs -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFMAXVV_FLOAT vfmaxvv_float64xm8 -#define VMFGEVF_FLOAT vmfgevf_e64xm8_float64xm8 -#define VMFIRSTM vmfirstm_e64xm8 -#define UINT_V_T uint64xm8_t -#define VIDV_MASK_UINT vidv_mask_uint64xm8 -#define VIDV_UINT vidv_uint64xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 -#define VADDVX_UINT vaddvx_uint64xm8 -#define VMVVX_UINT vmvvx_uint64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT vfmax_vv_f64m8 +#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 +#define VMFIRSTM vmfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT vid_v_u64m8_m +#define VIDV_UINT vid_v_u64m8 +#define VADDVX_MASK_UINT vadd_vx_u64m8_m +#define VADDVX_UINT vadd_vx_u64m8 +#define VMVVX_UINT vmv_v_x_u64m8 #else #define ABS fabsf -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFMAXVV_FLOAT vfmaxvv_float32xm8 -#define VMFGEVF_FLOAT vmfgevf_e32xm8_float32xm8 -#define VMFIRSTM vmfirstm_e32xm8 -#define UINT_V_T uint32xm8_t -#define VIDV_MASK_UINT vidv_mask_uint32xm8 -#define VIDV_UINT vidv_uint32xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 -#define VADDVX_UINT vaddvx_uint32xm8 -#define 
VMVVX_UINT vmvvx_uint32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT vfmax_vv_f32m8 +#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 +#define VMFIRSTM vmfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT vid_v_u32m8_m +#define VIDV_UINT vid_v_u32m8 +#define VADDVX_MASK_UINT vadd_vx_u32m8_m +#define VADDVX_UINT vadd_vx_u32m8 +#define VMVVX_UINT vmv_v_x_u32m8 #endif @@ -85,8 +89,13 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) UINT_V_T v_max_index; MASK_T mask; unsigned int gvl = 0; + FLOAT_V_T_M1 v_res, v_min; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_min = VFMVVF_FLOAT_M1(-FLT_MAX, gvl); + if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); v_max_index = VMVVX_UINT(0, gvl); v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); for(i=0,j=0; i < n/gvl; i++){ @@ -94,27 +103,25 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, gvl); - v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl); - v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; } - vx = VFMVVF_FLOAT(-FLT_MAX, gvl); - vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); - maxf = vx[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); + maxf = v_res[0]; mask = VMFGEVF_FLOAT(v_max, maxf, gvl); max_index = VMFIRSTM(mask,gvl); max_index = v_max_index[max_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); v_max = VLEV_FLOAT(&x[j], gvl); - vx = VFMVVF_FLOAT(-FLT_MAX, gvl); - vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); - FLOAT cur_maxf = vx[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); + FLOAT cur_maxf = v_res[0]; if(cur_maxf > maxf){ //tail index v_max_index = VIDV_UINT(gvl); @@ -126,7 +133,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } } }else{ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); unsigned int stride_x = inc_x * sizeof(FLOAT); unsigned int idx = 0, inc_v = gvl * inc_x; @@ -137,28 +144,26 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, gvl); - v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl); - v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; idx += inc_v; } - vx = VFMVVF_FLOAT(-FLT_MAX, gvl); - vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); - maxf = vx[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); + maxf = v_res[0]; mask = VMFGEVF_FLOAT(v_max, maxf, gvl); max_index = VMFIRSTM(mask,gvl); max_index = v_max_index[max_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl); - vx = VFMVVF_FLOAT(-FLT_MAX, gvl); - vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); - FLOAT cur_maxf = vx[0]; + 
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); + FLOAT cur_maxf = v_res[0]; if(cur_maxf > maxf){ //tail index v_max_index = VIDV_UINT(gvl); diff --git a/kernel/riscv64/imin_vector.c b/kernel/riscv64/imin_vector.c index e6e0e9f9f..3afa74dd6 100644 --- a/kernel/riscv64/imin_vector.c +++ b/kernel/riscv64/imin_vector.c @@ -32,45 +32,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(DOUBLE) #define ABS fabs -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMINVS_FLOAT vfredminvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFMINVV_FLOAT vfminvv_float64xm8 -#define VMFLEVF_FLOAT vmflevf_e64xm8_float64xm8 -#define VMFIRSTM vmfirstm_e64xm8 -#define UINT_V_T uint64xm8_t -#define VIDV_MASK_UINT vidv_mask_uint64xm8 -#define VIDV_UINT vidv_uint64xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 -#define VADDVX_UINT vaddvx_uint64xm8 -#define VMVVX_UINT vmvvx_uint64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMINVV_FLOAT vfmin_vv_f64m8 +#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 +#define VMFIRSTM vmfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT vid_v_u64m8_m +#define VIDV_UINT vid_v_u64m8 +#define VADDVX_MASK_UINT vadd_vx_u64m8_m +#define VADDVX_UINT vadd_vx_u64m8 +#define VMVVX_UINT vmv_v_x_u64m8 #else #define ABS fabsf -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMINVS_FLOAT vfredminvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFMINVV_FLOAT vfminvv_float32xm8 -#define VMFLEVF_FLOAT vmflevf_e32xm8_float32xm8 -#define VMFIRSTM vmfirstm_e32xm8 -#define UINT_V_T uint32xm8_t -#define VIDV_MASK_UINT vidv_mask_uint32xm8 -#define VIDV_UINT vidv_uint32xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 -#define VADDVX_UINT vaddvx_uint32xm8 -#define VMVVX_UINT vmvvx_uint32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMINVV_FLOAT vfmin_vv_f32m8 +#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 +#define VMFIRSTM vmfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT vid_v_u32m8_m +#define VIDV_UINT vid_v_u32m8 +#define VADDVX_MASK_UINT vadd_vx_u32m8_m +#define VADDVX_UINT vadd_vx_u32m8 +#define VMVVX_UINT vmv_v_x_u32m8 #endif @@ -85,15 +89,20 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) UINT_V_T v_min_index; MASK_T mask; unsigned int gvl = 0; + FLOAT_V_T_M1 v_res, v_max; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + if(inc_x == 1){ - gvl = 
vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); v_min = VFMVVF_FLOAT(FLT_MAX, gvl); v_min_index = VMVVX_UINT(0, gvl); for(i=0,j=0; i < n/gvl; i++){ vx = VLEV_FLOAT(&x[j], gvl); //index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, gvl); - v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); /* #if defined(DOUBLE) asm volatile( @@ -113,26 +122,24 @@ asm volatile( :"v0"); #endif */ - v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j,gvl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; } - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - minf = vx[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + minf = v_res[0]; mask = VMFLEVF_FLOAT(v_min, minf, gvl); min_index = VMFIRSTM(mask,gvl); min_index = v_min_index[min_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); v_min = VLEV_FLOAT(&x[j], gvl); - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - FLOAT cur_minf = vx[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + FLOAT cur_minf = v_res[0]; if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); @@ -143,7 +150,7 @@ asm volatile( } } }else{ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); unsigned int stride_x = inc_x * sizeof(FLOAT); unsigned int idx = 0, inc_v = gvl * inc_x; @@ -154,7 +161,7 @@ asm volatile( //index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, gvl); - v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); /* #if defined(DOUBLE) asm volatile( @@ -175,27 +182,25 @@ asm volatile( #endif */ - v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j,gvl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; idx += inc_v; } - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - minf = vx[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + minf = v_res[0]; mask = VMFLEVF_FLOAT(v_min, minf, gvl); min_index = VMFIRSTM(mask,gvl); min_index = v_min_index[min_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl); - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - FLOAT cur_minf = vx[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + FLOAT cur_minf = v_res[0]; if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); diff --git a/kernel/riscv64/izamax_vector.c b/kernel/riscv64/izamax_vector.c index 62c95d973..ddb5eabde 100644 --- a/kernel/riscv64/izamax_vector.c +++ b/kernel/riscv64/izamax_vector.c @@ -30,47 +30,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
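All of the i*_vector.c kernels above, and the izamax/izamin kernels that follow, share one index-tracking idiom, and the migration touches it twice: `vid_v_*_m` and `vadd_vx_*_m` now take the mask first, and the final fold goes through the m1 reduction exactly like the value kernels. A sketch of the idiom under the same assumptions as the earlier block (hypothetical function name, tail and short-n handling omitted, 0-based result):

```c
#include <float.h>
#include <riscv_vector.h>

/* Index-of-largest-value idiom from the i*_vector.c kernels: wherever a
 * lane beats the running maximum, overwrite its stored index with the
 * lane number plus the block offset j. */
static size_t imax_sketch(const float *x, size_t n)
{
    size_t gvl = vsetvl_e32m8(n);
    vfloat32m8_t v_max = vfmv_v_f_f32m8(-FLT_MAX, gvl);
    vuint32m8_t  v_idx = vmv_v_x_u32m8(0, gvl);
    for (size_t j = 0; j + gvl <= n; j += gvl) {
        vfloat32m8_t vx = vle_v_f32m8(&x[j], gvl);
        vbool4_t gt = vmflt_vv_f32m8_b4(v_max, vx, gvl);    /* vx > v_max  */
        v_idx = vid_v_u32m8_m(gt, v_idx, gvl);              /* lane number */
        v_idx = vadd_vx_u32m8_m(gt, v_idx, v_idx, j, gvl);  /* + offset j  */
        v_max = vfmax_vv_f32m8(v_max, vx, gvl);
    }
    /* fold to a scalar max, then locate the first lane holding it;
     * passing the seed as both destination and scalar operand gives the
     * same result as the kernels' separate v_res/v_min pair */
    vfloat32m1_t seed  = vfmv_v_f_f32m1(-FLT_MAX, vsetvlmax_e32m1());
    vfloat32m1_t v_res = vfredmax_vs_f32m8_f32m1(seed, v_max, seed, gvl);
    float maxf = v_res[0];
    vbool4_t hit = vmfge_vf_f32m8_b4(v_max, maxf, gvl);
    long lane = vmfirst_m_b4(hit, gvl);
    return v_idx[lane];
}
```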
#if defined(DOUBLE) -#define RVV_EFLOAT RVV_E64 -#define FLOAT_V_T float64xm8_t -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFMAXVV_FLOAT vfmaxvv_float64xm8 -#define VMFGEVF_FLOAT vmfgevf_e64xm8_float64xm8 -#define VMFIRSTM vmfirstm_e64xm8 -#define UINT_V_T uint64xm8_t -#define VIDV_MASK_UINT vidv_mask_uint64xm8 -#define VIDV_UINT vidv_uint64xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 -#define VADDVX_UINT vaddvx_uint64xm8 -#define VFADDVV_FLOAT vfaddvv_float64xm8 -#define VMVVX_UINT vmvvx_uint64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFMAXVV_FLOAT vfmax_vv_f64m8 +#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 +#define VMFIRSTM vmfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT vid_v_u64m8_m +#define VIDV_UINT vid_v_u64m8 +#define VADDVX_MASK_UINT vadd_vx_u64m8_m +#define VADDVX_UINT vadd_vx_u64m8 +#define VFADDVV_FLOAT vfadd_vv_f64m8 +#define VMVVX_UINT vmv_v_x_u64m8 #else #define ABS fabsf -#define RVV_EFLOAT RVV_E32 -#define FLOAT_V_T float32xm8_t -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFMAXVV_FLOAT vfmaxvv_float32xm8 -#define VMFGEVF_FLOAT vmfgevf_e32xm8_float32xm8 -#define VMFIRSTM vmfirstm_e32xm8 -#define UINT_V_T uint32xm8_t -#define VIDV_MASK_UINT vidv_mask_uint32xm8 -#define VIDV_UINT vidv_uint32xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 -#define VADDVX_UINT vaddvx_uint32xm8 -#define VFADDVV_FLOAT vfaddvv_float32xm8 -#define VMVVX_UINT vmvvx_uint32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFMAXVV_FLOAT vfmax_vv_f32m8 +#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 +#define VMFIRSTM vmfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT vid_v_u32m8_m +#define VIDV_UINT vid_v_u32m8 +#define VADDVX_MASK_UINT vadd_vx_u32m8_m +#define VADDVX_UINT vadd_vx_u32m8 +#define VFADDVV_FLOAT vfadd_vv_f32m8 +#define VMVVX_UINT vmv_v_x_u32m8 #endif #define RVV_M RVV_M8 @@ -86,7 +92,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) UINT_V_T v_max_index; MASK_T mask0, mask1; unsigned int gvl = 0; - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); + + gvl = VSETVL(n); v_max_index = 
VMVVX_UINT(0, gvl); v_max = VFMVVF_FLOAT(-1, gvl); BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); @@ -96,7 +107,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); //fabs(vector) mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); + vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); /* #if defined(DOUBLE) asm volatile( @@ -119,7 +130,7 @@ asm volatile( vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); //fabs(vector) mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); + vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); /* #if defined(DOUBLE) asm volatile( @@ -143,7 +154,7 @@ asm volatile( //index where element greater than v_max mask0 = VMFLTVV_FLOAT(v_max, vx0, gvl); - v_max_index = VIDV_MASK_UINT(v_max_index, mask0, gvl); + v_max_index = VIDV_MASK_UINT(mask0, v_max_index, gvl); /* #if defined(DOUBLE) asm volatile( @@ -163,7 +174,7 @@ asm volatile( :"v0"); #endif */ - v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask0, gvl); + v_max_index = VADDVX_MASK_UINT(mask0, v_max_index, v_max_index, j, gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx0, gvl); @@ -171,19 +182,19 @@ asm volatile( ix += inc_xv; } vx0 = VFMVVF_FLOAT(0, gvl); - vx0 = VFREDMAXVS_FLOAT(v_max, vx0, gvl); - maxf = vx0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); + maxf = v_res[0]; mask0 = VMFGEVF_FLOAT(v_max, maxf, gvl); max_index = VMFIRSTM(mask0,gvl); max_index = v_max_index[max_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); v_max_index = VMVVX_UINT(0, gvl); vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); //fabs(vector) mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); + vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); /* #if defined(DOUBLE) asm volatile( @@ -206,7 +217,7 @@ asm volatile( vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); //fabs(vector) mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); + vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); /* #if defined(DOUBLE) asm volatile( @@ -227,9 +238,8 @@ asm volatile( #endif */ v_max = VFADDVV_FLOAT(vx0, vx1, gvl); - vx0 = VFMVVF_FLOAT(0, gvl); - vx0 = VFREDMAXVS_FLOAT(v_max, vx0, gvl); - FLOAT cur_maxf = vx0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); + FLOAT cur_maxf = v_res[0]; if(cur_maxf > maxf){ //tail index v_max_index = VIDV_UINT(gvl); diff --git a/kernel/riscv64/izamin_vector.c b/kernel/riscv64/izamin_vector.c index 38eccf1b5..6e328dc31 100644 --- a/kernel/riscv64/izamin_vector.c +++ b/kernel/riscv64/izamin_vector.c @@ -31,50 +31,55 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
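izamax above and izamin below rank a complex element by |re| + |im| rather than the true modulus, using two strided loads over the interleaved real/imaginary layout. (Incidentally, the unchanged context line `vx0 = VFMVVF_FLOAT(0, gvl);` in izamax is now a dead store: the new reduction reads `v_res`/`v_z0` instead of `vx0`.) A sketch of the per-block magnitude computation, same dialect and hedges as the earlier sketches, with the byte stride expressed the way the kernels pass it:

```c
#include <riscv_vector.h>

/* One vector block of the izamax ranking magnitude: |re| + |im| over an
 * interleaved complex array; stride is in bytes, inc_x * 2 * sizeof(FLOAT)
 * as in the kernels. */
static vfloat32m8_t zabs_block(const float *x, long inc_x, size_t gvl)
{
    long stride = inc_x * 2 * sizeof(float);
    vfloat32m8_t re = vlse_v_f32m8(&x[0], stride, gvl);
    vfloat32m8_t im = vlse_v_f32m8(&x[1], stride, gvl);
    vbool4_t m0 = vmflt_vf_f32m8_b4(re, 0, gvl);
    re = vfrsub_vf_f32m8_m(m0, re, re, 0, gvl);   /* |re| */
    vbool4_t m1 = vmflt_vf_f32m8_b4(im, 0, gvl);
    im = vfrsub_vf_f32m8_m(m1, im, im, 0, gvl);   /* |im| */
    return vfadd_vv_f32m8(re, im, gvl);           /* ranking magnitude */
}
```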
#if defined(DOUBLE) -#define RVV_EFLOAT RVV_E64 -#define FLOAT_V_T float64xm8_t -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMINVS_FLOAT vfredminvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFMINVV_FLOAT vfminvv_float64xm8 -#define VMFLEVF_FLOAT vmflevf_e64xm8_float64xm8 -#define VMFIRSTM vmfirstm_e64xm8 -#define UINT_V_T uint64xm8_t -#define VIDV_MASK_UINT vidv_mask_uint64xm8 -#define VIDV_UINT vidv_uint64xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 -#define VADDVX_UINT vaddvx_uint64xm8 -#define VFADDVV_FLOAT vfaddvv_float64xm8 -#define VMVVX_UINT vmvvx_uint64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFMINVV_FLOAT vfmin_vv_f64m8 +#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 +#define VMFIRSTM vmfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT vid_v_u64m8_m +#define VIDV_UINT vid_v_u64m8 +#define VADDVX_MASK_UINT vadd_vx_u64m8_m +#define VADDVX_UINT vadd_vx_u64m8 +#define VFADDVV_FLOAT vfadd_vv_f64m8 +#define VMVVX_UINT vmv_v_x_u64m8 #else #define ABS fabsf -#define RVV_EFLOAT RVV_E32 -#define FLOAT_V_T float32xm8_t -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMINVS_FLOAT vfredminvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFMINVV_FLOAT vfminvv_float32xm8 -#define VMFLEVF_FLOAT vmflevf_e32xm8_float32xm8 -#define VMFIRSTM vmfirstm_e32xm8 -#define UINT_V_T uint32xm8_t -#define VIDV_MASK_UINT vidv_mask_uint32xm8 -#define VIDV_UINT vidv_uint32xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 -#define VADDVX_UINT vaddvx_uint32xm8 -#define VFADDVV_FLOAT vfaddvv_float32xm8 -#define VMVVX_UINT vmvvx_uint32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFMINVV_FLOAT vfmin_vv_f32m8 +#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 +#define VMFIRSTM vmfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT vid_v_u32m8_m +#define VIDV_UINT vid_v_u32m8 +#define VADDVX_MASK_UINT vadd_vx_u32m8_m +#define VADDVX_UINT vadd_vx_u32m8 +#define VFADDVV_FLOAT vfadd_vv_f32m8 +#define VMVVX_UINT vmv_v_x_u32m8 #endif -#define RVV_M RVV_M8 BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { @@ -87,7 +92,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) UINT_V_T v_min_index; MASK_T mask0, mask1; unsigned int gvl = 0; - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + FLOAT_V_T_M1 v_res, v_max; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_max = 
VFMVVF_FLOAT_M1(FLT_MAX, gvl); + + gvl = VSETVL(n); v_min_index = VMVVX_UINT(0, gvl); v_min = VFMVVF_FLOAT(FLT_MAX, gvl); BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); @@ -97,7 +107,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); //fabs(vector) mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); + vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); /* #if defined(DOUBLE) asm volatile( @@ -120,7 +130,7 @@ asm volatile( vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); //fabs(vector) mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); + vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); /* #if defined(DOUBLE) asm volatile( @@ -144,7 +154,7 @@ asm volatile( //index where element less than v_min mask0 = VMFLTVV_FLOAT(vx0, v_min, gvl); - v_min_index = VIDV_MASK_UINT(v_min_index, mask0, gvl); + v_min_index = VIDV_MASK_UINT(mask0, v_min_index, gvl); /* #if defined(DOUBLE) asm volatile( @@ -164,27 +174,26 @@ asm volatile( :"v0"); #endif */ - v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask0, gvl); + v_min_index = VADDVX_MASK_UINT(mask0, v_min_index, v_min_index, j, gvl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx0, gvl); j += gvl; ix += inc_xv; } - vx0 = VFMVVF_FLOAT(FLT_MAX, gvl); - vx0 = VFREDMINVS_FLOAT(v_min, vx0, gvl); - minf = vx0[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + minf = v_res[0]; mask0 = VMFLEVF_FLOAT(v_min, minf, gvl); min_index = VMFIRSTM(mask0,gvl); min_index = v_min_index[min_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); v_min_index = VMVVX_UINT(0, gvl); vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); //fabs(vector) mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); + vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); /* #if defined(DOUBLE) asm volatile( @@ -207,7 +216,7 @@ asm volatile( vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); //fabs(vector) mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); + vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); /* #if defined(DOUBLE) asm volatile( @@ -228,9 +237,8 @@ asm volatile( #endif */ v_min = VFADDVV_FLOAT(vx0, vx1, gvl); - vx0 = VFMVVF_FLOAT(FLT_MAX, gvl); - vx0 = VFREDMINVS_FLOAT(v_min, vx0, gvl); - FLOAT cur_minf = vx0[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + FLOAT cur_minf = v_res[0]; if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); diff --git a/kernel/riscv64/max_vector.c b/kernel/riscv64/max_vector.c index 4ef75452d..0fc59b74c 100644 --- a/kernel/riscv64/max_vector.c +++ b/kernel/riscv64/max_vector.c @@ -29,23 +29,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
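The max_vector.c changes that follow keep the loop structure shared by most of these kernels: when a full vector fits at least twice into n (`gvl <= n/2`), the main loop consumes two vector loads per iteration into one running accumulator, and only the single fold at the end changes under the new API; leftovers are reduced one vector at a time. A sketch of that shape, same assumptions as above (hypothetical function name, v0.10-era intrinsics):

```c
#include <float.h>
#include <riscv_vector.h>

/* Two-loads-per-iteration main loop used when gvl <= n/2, with the
 * remaining elements handled one vector at a time. */
static float max_sketch(const float *x, size_t n)
{
    vfloat32m1_t v_min = vfmv_v_f_f32m1(-FLT_MAX, vsetvlmax_e32m1());
    vfloat32m1_t v_res = v_min;
    float maxf = -FLT_MAX;
    size_t j = 0, gvl = vsetvl_e32m8(n);
    if (gvl <= n / 2) {
        vfloat32m8_t v_max = vfmv_v_f_f32m8(-FLT_MAX, gvl);
        for (; j + 2 * gvl <= n; j += 2 * gvl) {
            vfloat32m8_t v0 = vle_v_f32m8(&x[j], gvl);
            vfloat32m8_t v1 = vle_v_f32m8(&x[j + gvl], gvl);
            v_max = vfmax_vv_f32m8(v_max, v0, gvl);
            v_max = vfmax_vv_f32m8(v_max, v1, gvl);
        }
        v_res = vfredmax_vs_f32m8_f32m1(v_res, v_max, v_min, gvl);
        maxf = v_res[0];
    }
    for (; j < n;) {                       /* tail, one vector at a time */
        gvl = vsetvl_e32m8(n - j);
        vfloat32m8_t v0 = vle_v_f32m8(&x[j], gvl);
        v_res = vfredmax_vs_f32m8_f32m1(v_res, v0, v_min, gvl);
        if (v_res[0] > maxf) maxf = v_res[0];
        j += gvl;
    }
    return maxf;
}
```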
#include #include #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFMAXVV_FLOAT vfmaxvv_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT vfmax_vv_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFMAXVV_FLOAT vfmaxvv_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT vfmax_vv_f64m8 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -55,9 +59,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT maxf=-FLT_MAX; unsigned int gvl = 0; FLOAT_V_T v0, v1, v_max; + FLOAT_V_T_M1 v_res, v_min; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_min = VFMVVF_FLOAT_M1(-FLT_MAX, gvl); if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); if(gvl <= n/2){ v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); for(i=0,j=0; i maxf) - maxf = v0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v0, v_min, gvl); + if(v_res[0] > maxf) + maxf = v_res[0]; j += gvl; } }else{ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); BLASLONG stride_x = inc_x * sizeof(FLOAT); if(gvl <= n/2){ v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); @@ -96,17 +102,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) j += gvl * 2; idx += inc_xv * 2; } - v1 = VFMVVF_FLOAT(-FLT_MAX, gvl); - v0 = VFREDMAXVS_FLOAT(v_max, v1, gvl); - maxf = v0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); + maxf = v_res[0]; } for(;j maxf) - maxf = v0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v0, v_min, gvl); + if(v_res[0] > maxf) + maxf = v_res[0]; j += gvl; } } diff --git a/kernel/riscv64/min_vector.c b/kernel/riscv64/min_vector.c index 83c965bfa..8223fa87a 100644 --- a/kernel/riscv64/min_vector.c +++ b/kernel/riscv64/min_vector.c @@ -29,23 +29,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
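One detail worth spelling out before min_vector.c's macro block: in the new reduction intrinsics the m1 scalar operand's element 0 participates in the result, so each kernel must seed it with the identity of its operation. That is why the min/amin/imin kernels build a `v_max` filled with FLT_MAX, the max kernels use -FLT_MAX (or 0 when reducing absolute values), and the sum kernels use 0; the zero-filled `v_res` passed as the destination is equivalent here because the destination's old value does not feed an unmasked reduction. A short illustration under the same assumptions as the sketches above:

```c
#include <float.h>
#include <riscv_vector.h>

/* Fold an m8 accumulator into a scalar minimum. The m1 operand's element 0
 * enters the reduction, so it must hold the identity (FLT_MAX for min). */
static float fold_min(vfloat32m8_t acc, size_t gvl)
{
    vfloat32m1_t seed = vfmv_v_f_f32m1(FLT_MAX, vsetvlmax_e32m1());
    vfloat32m1_t res  = vfredmin_vs_f32m8_f32m1(seed, acc, seed, gvl);
    return res[0];
}
```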
 #include
 #include
 #if !defined(DOUBLE)
-#define RVV_EFLOAT RVV_E32
-#define RVV_M RVV_M8
-#define FLOAT_V_T float32xm8_t
-#define VLEV_FLOAT vlev_float32xm8
-#define VLSEV_FLOAT vlsev_float32xm8
-#define VFREDMINVS_FLOAT vfredminvs_float32xm8
-#define VFMVVF_FLOAT vfmvvf_float32xm8
-#define VFMINVV_FLOAT vfminvv_float32xm8
+#define VSETVL(n) vsetvl_e32m8(n)
+#define VSETVL_MAX vsetvlmax_e32m1()
+#define FLOAT_V_T vfloat32m8_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VLEV_FLOAT vle_v_f32m8
+#define VLSEV_FLOAT vlse_v_f32m8
+#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
+#define VFMVVF_FLOAT vfmv_v_f_f32m8
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
+#define VFMINVV_FLOAT vfmin_vv_f32m8
 #else
-#define RVV_EFLOAT RVV_E64
-#define RVV_M RVV_M8
-#define FLOAT_V_T float64xm8_t
-#define VLEV_FLOAT vlev_float64xm8
-#define VLSEV_FLOAT vlsev_float64xm8
-#define VFREDMINVS_FLOAT vfredminvs_float64xm8
-#define VFMVVF_FLOAT vfmvvf_float64xm8
-#define VFMINVV_FLOAT vfminvv_float64xm8
+#define VSETVL(n) vsetvl_e64m8(n)
+#define VSETVL_MAX vsetvlmax_e64m1()
+#define FLOAT_V_T vfloat64m8_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VLEV_FLOAT vle_v_f64m8
+#define VLSEV_FLOAT vlse_v_f64m8
+#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
+#define VFMVVF_FLOAT vfmv_v_f_f64m8
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
+#define VFMINVV_FLOAT vfmin_vv_f64m8
 #endif
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
@@ -55,9 +59,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
         FLOAT minf=FLT_MAX;
         unsigned int gvl = 0;
         FLOAT_V_T v0, v1, v_min;
+        FLOAT_V_T_M1 v_res, v_max;
+        gvl = VSETVL_MAX;
+        v_res = VFMVVF_FLOAT_M1(0, gvl);
+        v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl);
         if(inc_x == 1){
-                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                gvl = VSETVL(n);
                 if(gvl <= n/2){
                         v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
                         for(i=0,j=0; i
 #if !defined(DOUBLE)
-#define RVV_EFLOAT RVV_E32
-#define RVV_M RVV_M8
-#define FLOAT_V_T float32xm8_t
-#define VLEV_FLOAT vlev_float32xm8
-#define VLSEV_FLOAT vlsev_float32xm8
-#define VSEV_FLOAT vsev_float32xm8
-#define VSSEV_FLOAT vssev_float32xm8
+#define VSETVL(n) vsetvl_e32m8(n)
+#define VSETVL_MAX vsetvlmax_e32m1()
+#define FLOAT_V_T vfloat32m8_t
+#define VLEV_FLOAT vle_v_f32m8
+#define VLSEV_FLOAT vlse_v_f32m8
+#define VSEV_FLOAT vse_v_f32m8
+#define VSSEV_FLOAT vsse_v_f32m8
 #else
-#define RVV_EFLOAT RVV_E64
-#define RVV_M RVV_M8
-#define FLOAT_V_T float64xm8_t
-#define VLEV_FLOAT vlev_float64xm8
-#define VLSEV_FLOAT vlsev_float64xm8
-#define VSEV_FLOAT vsev_float64xm8
-#define VSSEV_FLOAT vssev_float64xm8
+#define VSETVL(n) vsetvl_e64m8(n)
+#define VSETVL_MAX vsetvlmax_e64m1()
+#define FLOAT_V_T vfloat64m8_t
+#define VLEV_FLOAT vle_v_f64m8
+#define VLSEV_FLOAT vlse_v_f64m8
+#define VSEV_FLOAT vse_v_f64m8
+#define VSSEV_FLOAT vsse_v_f64m8
 #endif
 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
@@ -55,7 +55,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
         if (n < 0) return(0);
         if(inc_x == 1 && inc_y == 1){
-                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                gvl = VSETVL(n);
                 if(gvl <= n/2){
                         for(i=0,j=0; i
 0){
-                        gvl = vsetvli(len, RVV_EFLOAT, RVV_M);
+                        gvl = VSETVL(len);
                         vr = VFMVVF_FLOAT(0, gvl);
                         for(k = 0; k < len / gvl; k++){
                                 va = VLEV_FLOAT(&a_ptr[i], gvl);
@@ -89,11 +97,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                                 i += gvl;
                         }
-                        va = VFMVVF_FLOAT(0, gvl);
-                        va = VFREDSUM_FLOAT(vr, va, gvl);
-                        temp2 = va[0];
+                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
+                        temp2 = v_res[0];
                         if(i < m){
-                                gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M);
+                                gvl = VSETVL(m-i);
                                 vy = VLEV_FLOAT(&y[i], gvl);
                                 va = VLEV_FLOAT(&a_ptr[i], gvl);
                                 vy = VFMACCVF_FLOAT(vy, temp1, va, gvl);
@@ -101,9 +108,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                                 vx = VLEV_FLOAT(&x[i], gvl);
                                 vr = VFMULVV_FLOAT(vx, va, gvl);
-                                va = VFMVVF_FLOAT(0, gvl);
-                                va = VFREDSUM_FLOAT(vr, va, gvl);
-                                temp2 += va[0];
+                                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
+                                temp2 += v_res[0];
                         }
                 }
                 y[j] += alpha * temp2;
@@ -121,7 +127,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                 i = j + 1;
                 len = m - i;
                 if(len > 0){
-                        gvl = vsetvli(len, RVV_EFLOAT, RVV_M);
+                        gvl = VSETVL(len);
                         inc_yv = inc_y * gvl;
                         vr = VFMVVF_FLOAT(0, gvl);
                         for(k = 0; k < len / gvl; k++){
@@ -136,11 +142,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                                 i += gvl;
                                 iy += inc_yv;
                         }
-                        va = VFMVVF_FLOAT(0, gvl);
-                        va = VFREDSUM_FLOAT(vr, va, gvl);
-                        temp2 = va[0];
+                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
+                        temp2 = v_res[0];
                         if(i < m){
-                                gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M);
+                                gvl = VSETVL(m-i);
                                 vy = VLSEV_FLOAT(&y[iy], stride_y, gvl);
                                 va = VLEV_FLOAT(&a_ptr[i], gvl);
                                 vy = VFMACCVF_FLOAT(vy, temp1, va, gvl);
@@ -148,9 +153,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                                 vx = VLEV_FLOAT(&x[i], gvl);
                                 vr = VFMULVV_FLOAT(vx, va, gvl);
-                                va = VFMVVF_FLOAT(0, gvl);
-                                va = VFREDSUM_FLOAT(vr, va, gvl);
-                                temp2 += va[0];
+                                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
+                                temp2 += v_res[0];
                         }
                 }
                 y[jy] += alpha * temp2;
@@ -169,7 +173,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                 i = j + 1;
                 len = m - i;
                 if(len > 0){
-                        gvl = vsetvli(len, RVV_EFLOAT, RVV_M);
+                        gvl = VSETVL(len);
                         vr = VFMVVF_FLOAT(0, gvl);
                         inc_xv = inc_x * gvl;
                         for(k = 0; k < len / gvl; k++){
@@ -184,11 +188,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                                 i += gvl;
                                 ix += inc_xv;
                         }
-                        va = VFMVVF_FLOAT(0, gvl);
-                        va = VFREDSUM_FLOAT(vr, va, gvl);
-                        temp2 = va[0];
+                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
+                        temp2 = v_res[0];
                         if(i < m){
-                                gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M);
+                                gvl = VSETVL(m-i);
                                 vy = VLEV_FLOAT(&y[i], gvl);
                                 va = VLEV_FLOAT(&a_ptr[i], gvl);
                                 vy = VFMACCVF_FLOAT(vy, temp1, va, gvl);
@@ -196,9 +199,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                                 vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
                                 vr = VFMULVV_FLOAT(vx, va, gvl);
-                                va = VFMVVF_FLOAT(0, gvl);
-                                va = VFREDSUM_FLOAT(vr, va, gvl);
-                                temp2 += va[0];
+                                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
+                                temp2 += v_res[0];
                         }
                 }
                 y[j] += alpha * temp2;
@@ -220,7 +222,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                 i = j + 1;
                 len = m - i;
                 if(len > 0){
-                        gvl = vsetvli(len, RVV_EFLOAT, RVV_M);
+                        gvl = VSETVL(len);
                         inc_xv = inc_x * gvl;
                         inc_yv = inc_y * gvl;
                         vr = VFMVVF_FLOAT(0, gvl);
@@ -237,11 +239,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                                 ix += inc_xv;
                                 iy += inc_yv;
                         }
-                        va = VFMVVF_FLOAT(0, gvl);
-                        va = VFREDSUM_FLOAT(vr, va, gvl);
-                        temp2 = va[0];
+                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
+                        temp2 = v_res[0];
                         if(i < m){
-                                gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M);
+                                gvl = VSETVL(m-i);
                                 vy = VLSEV_FLOAT(&y[iy], stride_y, gvl);
                                 va = VLEV_FLOAT(&a_ptr[i], gvl);
                                 vy = VFMACCVF_FLOAT(vy, temp1, va, gvl);
@@ -249,9 +250,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                                 vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
                                 vr = VFMULVV_FLOAT(vx, va, gvl);
-                                va = VFMVVF_FLOAT(0, gvl);
-                                va = VFREDSUM_FLOAT(vr, va, gvl);
-                                temp2 += va[0];
+                                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
+                                temp2 += v_res[0];
                         }
                 }
                 y[jy] += alpha * temp2;
diff --git a/kernel/riscv64/symv_U_vector.c b/kernel/riscv64/symv_U_vector.c
index 29e0e4b65..7229a48b1 100644
--- a/kernel/riscv64/symv_U_vector.c
+++ b/kernel/riscv64/symv_U_vector.c
@@ -27,33 +27,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 #if !defined(DOUBLE)
-#define RVV_EFLOAT RVV_E32
-#define RVV_M RVV_M4
-#define FLOAT_V_T float32xm4_t
-#define VLEV_FLOAT vlev_float32xm4
-#define VLSEV_FLOAT vlsev_float32xm4
-#define VSEV_FLOAT vsev_float32xm4
-#define VSSEV_FLOAT vssev_float32xm4
-#define VFREDSUM_FLOAT vfredsumvs_float32xm4
-#define VFMACCVV_FLOAT vfmaccvv_float32xm4
-#define VFMACCVF_FLOAT vfmaccvf_float32xm4
-#define VFMVVF_FLOAT vfmvvf_float32xm4
-#define VFDOTVV_FLOAT vfdotvv_float32xm4
-#define VFMULVV_FLOAT vfmulvv_float32xm4
+#define VSETVL(n) vsetvl_e32m4(n)
+#define VSETVL_MAX vsetvlmax_e32m1()
+#define FLOAT_V_T vfloat32m4_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VLEV_FLOAT vle_v_f32m4
+#define VLSEV_FLOAT vlse_v_f32m4
+#define VSEV_FLOAT vse_v_f32m4
+#define VSSEV_FLOAT vsse_v_f32m4
+#define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1
+#define VFMACCVV_FLOAT vfmacc_vv_f32m4
+#define VFMACCVF_FLOAT vfmacc_vf_f32m4
+#define VFMVVF_FLOAT vfmv_v_f_f32m4
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
+#define VFDOTVV_FLOAT vfdot_vv_f32m4
+#define VFMULVV_FLOAT vfmul_vv_f32m4
 #else
-#define RVV_EFLOAT RVV_E64
-#define RVV_M RVV_M4
-#define FLOAT_V_T float64xm4_t
-#define VLEV_FLOAT vlev_float64xm4
-#define VLSEV_FLOAT vlsev_float64xm4
-#define VSEV_FLOAT vsev_float64xm4
-#define VSSEV_FLOAT vssev_float64xm4
-#define VFREDSUM_FLOAT vfredsumvs_float64xm4
-#define VFMACCVV_FLOAT vfmaccvv_float64xm4
-#define VFMACCVF_FLOAT vfmaccvf_float64xm4
-#define VFMVVF_FLOAT vfmvvf_float64xm4
-#define VFDOTVV_FLOAT vfdotvv_float64xm4
-#define VFMULVV_FLOAT vfmulvv_float64xm4
+#define VSETVL(n) vsetvl_e64m4(n)
+#define VSETVL_MAX vsetvlmax_e64m1()
+#define FLOAT_V_T vfloat64m4_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VLEV_FLOAT vle_v_f64m4
+#define VLSEV_FLOAT vlse_v_f64m4
+#define VSEV_FLOAT vse_v_f64m4
+#define VSSEV_FLOAT vsse_v_f64m4
+#define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1
+#define VFMACCVV_FLOAT vfmacc_vv_f64m4
+#define VFMACCVF_FLOAT vfmacc_vf_f64m4
+#define VFMVVF_FLOAT vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
+#define VFDOTVV_FLOAT vfdot_vv_f64m4
+#define VFMULVV_FLOAT vfmul_vv_f64m4
 #endif
 int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
@@ -65,6 +69,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
         FLOAT temp2;
         FLOAT *a_ptr = a;
         unsigned int gvl = 0;
+        FLOAT_V_T_M1 v_res, v_z0;
+        gvl = VSETVL_MAX;
+        v_res = VFMVVF_FLOAT_M1(0, gvl);
+        v_z0 = VFMVVF_FLOAT_M1(0, gvl);
         FLOAT_V_T va, vx, vy, vr;
         BLASLONG stride_x, stride_y, inc_xv, inc_yv;
@@ -78,7 +86,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                 temp2 = 0.0;
                 if(j > 0){
                         i = 0;
-                        gvl = vsetvli(j, RVV_EFLOAT, RVV_M);
+                        gvl = VSETVL(j);
                         vr = VFMVVF_FLOAT(0, gvl);
                         for(k = 0; k < j / gvl; k++){
                                 vy = VLEV_FLOAT(&y[i], gvl);
@@ -91,11 +99,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                                 i += gvl;
                         }
-                        va = VFMVVF_FLOAT(0, gvl);
-                        va = VFREDSUM_FLOAT(vr, va, gvl);
-                        temp2 = va[0];
+                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
+                        temp2 = v_res[0];
                         if(i < j){
-                                gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M);
+                                gvl = VSETVL(j-i);
                                 vy = VLEV_FLOAT(&y[i], gvl);
                                 va = VLEV_FLOAT(&a_ptr[i], gvl);
                                 vy = VFMACCVF_FLOAT(vy, temp1, va, gvl);
@@ -103,9 +110,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                                 vx = VLEV_FLOAT(&x[i], gvl);
                                 vr = VFMULVV_FLOAT(vx, va, gvl);
-                                va = VFMVVF_FLOAT(0, gvl);
-                                va = VFREDSUM_FLOAT(vr, va, gvl);
-                                temp2 += va[0];
+                                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
+                                temp2 += v_res[0];
                         }
                 }
                 y[j] += temp1 * a_ptr[j] + alpha * temp2;
@@ -122,7 +128,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                 if(j > 0){
                         iy = 0;
                         i = 0;
-                        gvl = vsetvli(j, RVV_EFLOAT, RVV_M);
+                        gvl = VSETVL(j);
                         inc_yv = inc_y * gvl;
                         vr = VFMVVF_FLOAT(0, gvl);
                         for(k = 0; k < j / gvl; k++){
@@ -137,11 +143,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                                 i += gvl;
                                 iy += inc_yv;
                         }
-                        va = VFMVVF_FLOAT(0, gvl);
-                        va = VFREDSUM_FLOAT(vr, va, gvl);
-                        temp2 = va[0];
+                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
+                        temp2 = v_res[0];
                         if(i < j){
-                                gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M);
+                                gvl = VSETVL(j-i);
                                 vy = VLSEV_FLOAT(&y[iy], stride_y, gvl);
                                 va = VLEV_FLOAT(&a_ptr[i], gvl);
                                 vy = VFMACCVF_FLOAT(vy, temp1, va, gvl);
@@ -149,9 +154,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                                 vx = VLEV_FLOAT(&x[i], gvl);
                                 vr = VFMULVV_FLOAT(vx, va, gvl);
-                                va = VFMVVF_FLOAT(0, gvl);
-                                va = VFREDSUM_FLOAT(vr, va, gvl);
-                                temp2 += va[0];
+                                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
+                                temp2 += v_res[0];
                         }
                 }
                 y[jy] += temp1 * a_ptr[j] + alpha * temp2;
@@ -169,7 +173,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                 if(j > 0){
                         ix = 0;
                         i = 0;
-                        gvl = vsetvli(j, RVV_EFLOAT, RVV_M);
+                        gvl = VSETVL(j);
                         inc_xv = inc_x * gvl;
                         vr = VFMVVF_FLOAT(0, gvl);
                         for(k = 0; k < j / gvl; k++){
@@ -184,11 +188,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                                 i += gvl;
                                 ix += inc_xv;
                         }
-                        va = VFMVVF_FLOAT(0, gvl);
-                        va = VFREDSUM_FLOAT(vr, va, gvl);
-                        temp2 = va[0];
+                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
+                        temp2 = v_res[0];
                         if(i < j){
-                                gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M);
+                                gvl = VSETVL(j-i);
                                 vy = VLEV_FLOAT(&y[i], gvl);
                                 va = VLEV_FLOAT(&a_ptr[i], gvl);
                                 vy = VFMACCVF_FLOAT(vy, temp1, va, gvl);
@@ -196,9 +199,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                                 vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
                                 vr = VFMULVV_FLOAT(vx, va, gvl);
-                                va = VFMVVF_FLOAT(0, gvl);
-                                va = VFREDSUM_FLOAT(vr, va, gvl);
-                                temp2 += va[0];
+                                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
+                                temp2 += v_res[0];
                         }
                 }
                 y[j] += temp1 * a_ptr[j] + alpha * temp2;
@@ -219,7 +221,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                         ix = 0;
                         iy = 0;
                         i = 0;
-                        gvl = vsetvli(j, RVV_EFLOAT, RVV_M);
+                        gvl = VSETVL(j);
                         inc_xv = inc_x * gvl;
                         inc_yv = inc_y * gvl;
                         vr = VFMVVF_FLOAT(0, gvl);
@@ -236,11 +238,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                                 ix += inc_xv;
                                 iy += inc_yv;
                         }
-                        va = VFMVVF_FLOAT(0, gvl);
-                        va = VFREDSUM_FLOAT(vr, va, gvl);
-                        temp2 = va[0];
+                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
+                        temp2 = v_res[0];
                         if(i < j){
-                                gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M);
+                                gvl = VSETVL(j-i);
                                 vy = VLSEV_FLOAT(&y[iy], stride_y, gvl);
                                 va = VLEV_FLOAT(&a_ptr[i], gvl);
                                 vy = VFMACCVF_FLOAT(vy, temp1, va, gvl);
@@ -248,9 +249,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                                 vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
                                 vr = VFMULVV_FLOAT(vx, va, gvl);
-                                va = VFMVVF_FLOAT(0, gvl);
-                                va = VFREDSUM_FLOAT(vr, va, gvl);
-                                temp2 += va[0];
+                                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
+                                temp2 += v_res[0];
                         }
                 }
                 y[jy] += temp1 * a_ptr[j] + alpha * temp2;
diff --git a/kernel/riscv64/zamax_vector.c b/kernel/riscv64/zamax_vector.c
index a6c742b14..5cd65b225 100644
--- a/kernel/riscv64/zamax_vector.c
+++ b/kernel/riscv64/zamax_vector.c
@@ -29,29 +29,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include
 #if !defined(DOUBLE)
-#define RVV_EFLOAT RVV_E32
-#define RVV_M RVV_M8
-#define FLOAT_V_T float32xm8_t
-#define VLSEV_FLOAT vlsev_float32xm8
-#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8
-#define MASK_T e32xm8_t
-#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8
-#define VFMVVF_FLOAT vfmvvf_float32xm8
-#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8
-#define VFMAXVV_FLOAT vfmaxvv_float32xm8
-#define VFADDVV_FLOAT vfaddvv_float32xm8
+#define VSETVL(n) vsetvl_e32m8(n)
+#define VSETVL_MAX vsetvlmax_e32m1()
+#define FLOAT_V_T vfloat32m8_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VLSEV_FLOAT vlse_v_f32m8
+#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
+#define MASK_T vbool4_t
+#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
+#define VFMVVF_FLOAT vfmv_v_f_f32m8
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
+#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
+#define VFMAXVV_FLOAT vfmax_vv_f32m8
+#define VFADDVV_FLOAT vfadd_vv_f32m8
 #else
-#define RVV_EFLOAT RVV_E64
-#define RVV_M RVV_M8
-#define FLOAT_V_T float64xm8_t
-#define VLSEV_FLOAT vlsev_float64xm8
-#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8
-#define MASK_T e64xm8_t
-#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8
-#define VFMVVF_FLOAT vfmvvf_float64xm8
-#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8
-#define VFMAXVV_FLOAT vfmaxvv_float64xm8
-#define VFADDVV_FLOAT vfaddvv_float64xm8
+#define VSETVL(n) vsetvl_e64m8(n)
+#define VSETVL_MAX vsetvlmax_e64m1()
+#define FLOAT_V_T vfloat64m8_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VLSEV_FLOAT vlse_v_f64m8
+#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
+#define MASK_T vbool8_t
+#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
+#define VFMVVF_FLOAT vfmv_v_f_f64m8
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
+#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
+#define VFMAXVV_FLOAT vfmax_vv_f64m8
+#define VFADDVV_FLOAT vfadd_vv_f64m8
 #endif
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
@@ -62,19 +66,23 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
         if (n <= 0 || inc_x <= 0) return(maxf);
         unsigned int gvl = 0;
         FLOAT_V_T v0, v1, v_max;
+        FLOAT_V_T_M1 v_res, v_z0;
+        gvl = VSETVL_MAX;
+        v_res = VFMVVF_FLOAT_M1(0, gvl);
+        v_z0 = VFMVVF_FLOAT_M1(0, gvl);
         MASK_T mask0, mask1;
         BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
-        gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+        gvl = VSETVL(n);
         v_max = VFMVVF_FLOAT(0, gvl);
         BLASLONG inc_xv = inc_x * gvl * 2;
         for(; i
 maxf)
-                        maxf = v_max[0];
+                v_res = VFREDMAXVS_FLOAT(v_res, v1, v_z0, gvl);
+                if(v_res[0] > maxf)
+                        maxf = v_res[0];
         }
         return(maxf);
 }
diff --git a/kernel/riscv64/zamin_vector.c b/kernel/riscv64/zamin_vector.c
index 44a7cf1dc..9d567b3da 100644
--- a/kernel/riscv64/zamin_vector.c
+++ b/kernel/riscv64/zamin_vector.c
@@ -30,29 +30,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include
 #if !defined(DOUBLE)
-#define RVV_EFLOAT RVV_E32
-#define RVV_M RVV_M8
-#define FLOAT_V_T float32xm8_t
-#define VLSEV_FLOAT vlsev_float32xm8
-#define VFREDMINVS_FLOAT vfredminvs_float32xm8
-#define MASK_T e32xm8_t
-#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8
-#define VFMVVF_FLOAT vfmvvf_float32xm8
-#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8
-#define VFMINVV_FLOAT vfminvv_float32xm8
-#define VFADDVV_FLOAT vfaddvv_float32xm8
+#define VSETVL(n) vsetvl_e32m8(n)
+#define VSETVL_MAX vsetvlmax_e32m1()
+#define FLOAT_V_T vfloat32m8_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VLSEV_FLOAT vlse_v_f32m8
+#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
+#define MASK_T vbool4_t
+#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
+#define VFMVVF_FLOAT vfmv_v_f_f32m8
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
+#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
+#define VFMINVV_FLOAT vfmin_vv_f32m8
+#define VFADDVV_FLOAT vfadd_vv_f32m8
 #else
-#define RVV_EFLOAT RVV_E64
-#define RVV_M RVV_M8
-#define FLOAT_V_T float64xm8_t
-#define VLSEV_FLOAT vlsev_float64xm8
-#define VFREDMINVS_FLOAT vfredminvs_float64xm8
-#define MASK_T e64xm8_t
-#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8
-#define VFMVVF_FLOAT vfmvvf_float64xm8
-#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8
-#define VFMINVV_FLOAT vfminvv_float64xm8
-#define VFADDVV_FLOAT vfaddvv_float64xm8
+#define VSETVL(n) vsetvl_e64m8(n)
+#define VSETVL_MAX vsetvlmax_e64m1()
+#define FLOAT_V_T vfloat64m8_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VLSEV_FLOAT vlse_v_f64m8
+#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
+#define MASK_T vbool8_t
+#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
+#define VFMVVF_FLOAT vfmv_v_f_f64m8
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
+#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
+#define VFMINVV_FLOAT vfmin_vv_f64m8
+#define VFADDVV_FLOAT vfadd_vv_f64m8
 #endif
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
@@ -63,18 +67,23 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
         FLOAT minf=FLT_MAX;
         unsigned int gvl = 0;
         FLOAT_V_T v0, v1, v_min;
+        FLOAT_V_T_M1 v_res, v_max;
+        gvl = VSETVL_MAX;
+        v_res = VFMVVF_FLOAT_M1(0, gvl);
+        v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl);
+
         MASK_T mask0, mask1;
         BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
-        gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+        gvl = VSETVL(n);
         v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
         BLASLONG inc_xv = inc_x * gvl * 2;
         for(; i
 #if !defined(DOUBLE)
-#define RVV_EFLOAT RVV_E32
-#define RVV_M RVV_M8
-#define FLOAT_V_T float32xm8_t
-#define VLEV_FLOAT vlev_float32xm8
-#define VLSEV_FLOAT vlsev_float32xm8
-#define VFREDSUMVS_FLOAT vfredsumvs_float32xm8
-#define MASK_T e32xm8_t
-#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8
-#define VFMVVF_FLOAT vfmvvf_float32xm8
-#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8
-#define VFADDVV_FLOAT vfaddvv_float32xm8
+#define VSETVL(n) vsetvl_e32m8(n)
+#define VSETVL_MAX vsetvlmax_e32m1()
+#define FLOAT_V_T vfloat32m8_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VLEV_FLOAT vle_v_f32m8
+#define VLSEV_FLOAT vlse_v_f32m8
+#define VFREDSUMVS_FLOAT vfredsum_vs_f32m8_f32m1
+#define MASK_T vbool4_t
+#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
+#define VFMVVF_FLOAT vfmv_v_f_f32m8
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
+#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
+#define VFADDVV_FLOAT vfadd_vv_f32m8
 #else
-#define RVV_EFLOAT RVV_E64
-#define RVV_M RVV_M8
-#define FLOAT_V_T float64xm8_t
-#define VLEV_FLOAT vlev_float64xm8
-#define VLSEV_FLOAT vlsev_float64xm8
-#define VFREDSUMVS_FLOAT vfredsumvs_float64xm8
-#define MASK_T e64xm8_t
-#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8
-#define VFMVVF_FLOAT vfmvvf_float64xm8
-#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8
-#define VFADDVV_FLOAT vfaddvv_float64xm8
+#define VSETVL(n) vsetvl_e64m8(n)
+#define VSETVL_MAX vsetvlmax_e64m1()
+#define FLOAT_V_T vfloat64m8_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VLEV_FLOAT vle_v_f64m8
+#define VLSEV_FLOAT vlse_v_f64m8
+#define VFREDSUMVS_FLOAT vfredsum_vs_f64m8_f64m1
+#define MASK_T vbool8_t
+#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
+#define VFMVVF_FLOAT vfmv_v_f_f64m8
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
+#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
+#define VFADDVV_FLOAT vfadd_vv_f64m8
 #endif
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
@@ -61,40 +65,44 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
         if (n <= 0 || inc_x <= 0) return(asumf);
         unsigned int gvl = 0;
         FLOAT_V_T v0, v1, v_zero,v_sum;
+        FLOAT_V_T_M1 v_res, v_z0;
+        gvl = VSETVL_MAX;
+        v_res = VFMVVF_FLOAT_M1(0, gvl);
+        v_z0 = VFMVVF_FLOAT_M1(0, gvl);
         MASK_T mask0, mask1;
         if(inc_x == 1){
                 BLASLONG n2 = n * 2;
-                gvl = vsetvli(n2, RVV_EFLOAT, RVV_M);
+                gvl = VSETVL(n2);
                 v_zero = VFMVVF_FLOAT(0, gvl);
                 if(gvl <= n2/2){
                         v_sum = VFMVVF_FLOAT(0, gvl);
                         for(i=0,j=0; i
 0){
-                        gvl = vsetvli(len, RVV_EFLOAT, RVV_M);
+                        gvl = VSETVL(len);
                         inc_xv = incx * gvl * 2;
                         inc_yv = incy * gvl * 2;
                         inc_av = gvl * 2;
@@ -134,13 +141,12 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B
                                 iy += inc_yv;
                                 ia += inc_av;
                         }
-                        va0 = VFMVVF_FLOAT(0, gvl);
-                        vx0 = VFREDSUM_FLOAT(vr0, va0, gvl);
-                        temp_r2 = vx0[0];
-                        vx1 = VFREDSUM_FLOAT(vr1, va0, gvl);
-                        temp_i2 = vx1[0];
+                        v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl);
+                        temp_r2 = v_res[0];
+                        v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl);
+                        temp_i2 = v_res[0];
                         if(i < m){
-                                gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M);
+                                gvl = VSETVL(m-i);
                                 va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl);
                                 va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl);
                                 vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
@@ -173,11 +179,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B
                                 vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl);
 #endif
-                                va0 = VFMVVF_FLOAT(0, gvl);
-                                vx0 = VFREDSUM_FLOAT(vr0, va0, gvl);
-                                temp_r2 += vx0[0];
-                                vx1 = VFREDSUM_FLOAT(vr1, va0, gvl);
-                                temp_i2 += vx1[0];
+                                v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl);
+                                temp_r2 += v_res[0];
+                                v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl);
+                                temp_i2 += v_res[0];
                         }
                 }
                 y[jy] += alpha_r * temp_r2 - alpha_i * temp_i2;
diff --git a/kernel/riscv64/zhemv_UV_vector.c b/kernel/riscv64/zhemv_UV_vector.c
index 6fe12c76c..40cd9cd64 100644
--- a/kernel/riscv64/zhemv_UV_vector.c
+++ b/kernel/riscv64/zhemv_UV_vector.c
@@ -27,31 +27,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 #if !defined(DOUBLE)
-#define RVV_EFLOAT RVV_E32
-#define RVV_M RVV_M4
-#define FLOAT_V_T float32xm4_t
-#define VLSEV_FLOAT vlsev_float32xm4
-#define VSSEV_FLOAT vssev_float32xm4
-#define VFREDSUM_FLOAT vfredsumvs_float32xm4
-#define VFMACCVV_FLOAT vfmaccvv_float32xm4
-#define VFMACCVF_FLOAT vfmaccvf_float32xm4
-#define VFMVVF_FLOAT vfmvvf_float32xm4
-#define VFMULVV_FLOAT vfmulvv_float32xm4
-#define VFNMSACVF_FLOAT vfnmsacvf_float32xm4
-#define VFNMSACVV_FLOAT vfnmsacvv_float32xm4
+#define VSETVL(n) vsetvl_e32m4(n)
+#define VSETVL_MAX vsetvlmax_e32m1()
+#define FLOAT_V_T vfloat32m4_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VLSEV_FLOAT vlse_v_f32m4
+#define VSSEV_FLOAT vsse_v_f32m4
+#define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1
+#define VFMACCVV_FLOAT vfmacc_vv_f32m4
+#define VFMACCVF_FLOAT vfmacc_vf_f32m4
+#define VFMVVF_FLOAT vfmv_v_f_f32m4
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
+#define VFMULVV_FLOAT vfmul_vv_f32m4
+#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
+#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4
 #else
-#define RVV_EFLOAT RVV_E64
-#define RVV_M RVV_M4
-#define FLOAT_V_T float64xm4_t
-#define VLSEV_FLOAT vlsev_float64xm4
-#define VSSEV_FLOAT vssev_float64xm4
-#define VFREDSUM_FLOAT vfredsumvs_float64xm4
-#define VFMACCVV_FLOAT vfmaccvv_float64xm4
-#define VFMACCVF_FLOAT vfmaccvf_float64xm4
-#define VFMVVF_FLOAT vfmvvf_float64xm4
-#define VFMULVV_FLOAT vfmulvv_float64xm4
-#define VFNMSACVF_FLOAT vfnmsacvf_float64xm4
-#define VFNMSACVV_FLOAT vfnmsacvv_float64xm4
+#define VSETVL(n) vsetvl_e64m4(n)
+#define VSETVL_MAX vsetvlmax_e64m1()
+#define FLOAT_V_T vfloat64m4_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VLSEV_FLOAT vlse_v_f64m4
+#define VSSEV_FLOAT vsse_v_f64m4
+#define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1
+#define VFMACCVV_FLOAT vfmacc_vv_f64m4
+#define VFMACCVF_FLOAT vfmacc_vf_f64m4
+#define VFMVVF_FLOAT vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
+#define VFMULVV_FLOAT vfmul_vv_f64m4
+#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
+#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4
 #endif
 int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){
@@ -62,7 +66,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B
         FLOAT temp_r2, temp_i2;
         FLOAT *a_ptr = a;
         unsigned int gvl = 0;
-
+        FLOAT_V_T_M1 v_res, v_z0;
+        gvl = VSETVL_MAX;
+        v_res = VFMVVF_FLOAT_M1(0, gvl);
+        v_z0 = VFMVVF_FLOAT_M1(0, gvl);
         FLOAT_V_T va0, va1, vx0, vx1, vy0, vy1, vr0, vr1;
         BLASLONG stride_x, stride_y, stride_a, inc_xv, inc_yv, inc_av, lda2;
@@ -89,7 +96,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B
                 ia = 0;
                 i = 0;
                 if(j > 0){
-                        gvl = vsetvli(j, RVV_EFLOAT, RVV_M);
+                        gvl = VSETVL(j);
                         inc_xv = incx * gvl * 2;
                         inc_yv = incy * gvl * 2;
                         inc_av = gvl * 2;
@@ -133,13 +140,12 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B
                                 iy += inc_yv;
                                 ia += inc_av;
                         }
-                        va0 = VFMVVF_FLOAT(0, gvl);
-                        vx0 = VFREDSUM_FLOAT(vr0, va0, gvl);
-                        temp_r2 = vx0[0];
-                        vx1 = VFREDSUM_FLOAT(vr1, va0, gvl);
-                        temp_i2 = vx1[0];
+                        v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl);
+                        temp_r2 = v_res[0];
+                        v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl);
+                        temp_i2 = v_res[0];
                         if(i < j){
-                                gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M);
+                                gvl = VSETVL(j-i);
                                 va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl);
                                 va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl);
                                 vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
@@ -172,11 +178,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B
                                 vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl);
 #endif
-                                va0 = VFMVVF_FLOAT(0, gvl);
-                                vx0 = VFREDSUM_FLOAT(vr0, va0, gvl);
-                                temp_r2 += vx0[0];
-                                vx1 = VFREDSUM_FLOAT(vr1, va0, gvl);
-                                temp_i2 += vx1[0];
+                                v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl);
+                                temp_r2 += v_res[0];
+                                v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl);
+                                temp_i2 += v_res[0];
                         }
                 }
                 y[jy] += temp_r1 * a_ptr[ja];
diff --git a/kernel/riscv64/znrm2_vector.c b/kernel/riscv64/znrm2_vector.c
index b0ebfa5f4..5ac62eb80 100644
--- a/kernel/riscv64/znrm2_vector.c
+++ b/kernel/riscv64/znrm2_vector.c
@@ -27,41 +27,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 #if !defined(DOUBLE)
-#define RVV_EFLOAT RVV_E32
-#define RVV_M RVV_M4
-#define FLOAT_V_T float32xm4_t
-#define VLEV_FLOAT vlev_float32xm4
-#define VLSEV_FLOAT vlsev_float32xm4
-#define VFREDSUM_FLOAT vfredsumvs_float32xm4
-#define VFMACCVV_FLOAT vfmaccvv_float32xm4
-#define VFMVVF_FLOAT vfmvvf_float32xm4
-#define VFDOTVV_FLOAT vfdotvv_float32xm4
+#define VSETVL(n) vsetvl_e32m4(n)
+#define VSETVL_MAX vsetvlmax_e32m1()
+#define FLOAT_V_T vfloat32m4_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VLEV_FLOAT vle_v_f32m4
+#define VLSEV_FLOAT vlse_v_f32m4
+#define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1
+#define VFMACCVV_FLOAT vfmacc_vv_f32m4
+#define VFMVVF_FLOAT vfmv_v_f_f32m4
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
+#define VFDOTVV_FLOAT vfdot_vv_f32m4
 #define ABS fabsf
-#define MASK_T e32xm4_t
-#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm4
-#define VMFGTVF_FLOAT vmfgtvf_e32xm4_float32xm4
-#define VMFIRSTM vmfirstm_e32xm4
-#define VFDIVVF_FLOAT vfdivvf_float32xm4
-#define VMFLTVF_FLOAT vmfltvf_e32xm4_float32xm4
-#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm4
+#define MASK_T vbool8_t
+#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m4_m
+#define VMFGTVF_FLOAT vmfgt_vf_f32m4_b8
+#define VMFIRSTM vmfirst_m_b8
+#define VFDIVVF_FLOAT vfdiv_vf_f32m4
+#define VMFLTVF_FLOAT vmflt_vf_f32m4_b8
+#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1
 #else
-#define RVV_EFLOAT RVV_E64
-#define RVV_M RVV_M4
-#define FLOAT_V_T float64xm4_t
-#define VLEV_FLOAT vlev_float64xm4
-#define VLSEV_FLOAT vlsev_float64xm4
-#define VFREDSUM_FLOAT vfredsumvs_float64xm4
-#define VFMACCVV_FLOAT vfmaccvv_float64xm4
-#define VFMVVF_FLOAT vfmvvf_float64xm4
-#define VFDOTVV_FLOAT vfdotvv_float64xm4
+#define VSETVL(n) vsetvl_e64m4(n)
+#define VSETVL_MAX vsetvlmax_e64m1()
+#define FLOAT_V_T vfloat64m4_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VLEV_FLOAT vle_v_f64m4
+#define VLSEV_FLOAT vlse_v_f64m4
+#define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1
+#define VFMACCVV_FLOAT vfmacc_vv_f64m4
+#define VFMVVF_FLOAT vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
+#define VFDOTVV_FLOAT vfdot_vv_f64m4
 #define ABS fabs
-#define MASK_T e64xm4_t
-#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm4
-#define VMFGTVF_FLOAT vmfgtvf_e64xm4_float64xm4
-#define VMFIRSTM vmfirstm_e64xm4
-#define VFDIVVF_FLOAT vfdivvf_float64xm4
-#define VMFLTVF_FLOAT vmfltvf_e64xm4_float64xm4
-#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm4
+#define MASK_T vbool16_t
+#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m4_m
+#define VMFGTVF_FLOAT vmfgt_vf_f64m4_b16
+#define VMFIRSTM vmfirst_m_b16
+#define VFDIVVF_FLOAT vfdiv_vf_f64m4
+#define VMFLTVF_FLOAT vmflt_vf_f64m4_b16
+#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1
 #endif
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
@@ -73,19 +77,24 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
         FLOAT_V_T vr, v0, v_zero;
         unsigned int gvl = 0;
+        FLOAT_V_T_M1 v_res, v_z0;
+        gvl = VSETVL_MAX;
+        v_res = VFMVVF_FLOAT_M1(0, gvl);
+        v_z0 = VFMVVF_FLOAT_M1(0, gvl);
+
         FLOAT scale = 0.0, ssq = 0.0;
         MASK_T mask;
         BLASLONG index = 0;
         if(inc_x == 1){
                 BLASLONG n2 = n * 2;
-                gvl = vsetvli(n2, RVV_EFLOAT, RVV_M);
+                gvl = VSETVL(n2);
                 vr = VFMVVF_FLOAT(0, gvl);
                 v_zero = VFMVVF_FLOAT(0, gvl);
                 for(i=0,j=0; i
 #if !defined(DOUBLE)
-#define RVV_EFLOAT RVV_E32
-#define RVV_M RVV_M8
-#define FLOAT_V_T float32xm8_t
-#define VLEV_FLOAT vlev_float32xm8
-#define VLSEV_FLOAT vlsev_float32xm8
-#define VSEV_FLOAT vsev_float32xm8
-#define VSSEV_FLOAT vssev_float32xm8
+#define VSETVL(n) vsetvl_e32m8(n)
+#define VSETVL_MAX vsetvlmax_e32m1()
+#define FLOAT_V_T vfloat32m8_t
+#define VLEV_FLOAT vle_v_f32m8
+#define VLSEV_FLOAT vlse_v_f32m8
+#define VSEV_FLOAT vse_v_f32m8
+#define VSSEV_FLOAT vsse_v_f32m8
 #else
-#define RVV_EFLOAT RVV_E64
-#define RVV_M RVV_M8
-#define FLOAT_V_T float64xm8_t
-#define VLEV_FLOAT vlev_float64xm8
-#define VLSEV_FLOAT vlsev_float64xm8
-#define VSEV_FLOAT vsev_float64xm8
-#define VSSEV_FLOAT vssev_float64xm8
+#define VSETVL(n) vsetvl_e64m8(n)
+#define VSETVL_MAX vsetvlmax_e64m1()
+#define FLOAT_V_T vfloat64m8_t
+#define VLEV_FLOAT vle_v_f64m8
+#define VLSEV_FLOAT vlse_v_f64m8
+#define VSEV_FLOAT vse_v_f64m8
+#define VSSEV_FLOAT vsse_v_f64m8
 #endif
 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
@@ -55,7 +55,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm
         if (n < 0) return(0);
         if(inc_x == 1 && inc_y == 1){
-                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                gvl = VSETVL(n);
                 BLASLONG n2 = n * 2;
                 if(gvl <= n2/2){
                         for(i=0,j=0; i
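
The change repeated across every kernel above is the reduction idiom: the old code zeroed a full-LMUL accumulator with VFMVVF_FLOAT(0, gvl) before each vfredsum/vfredmax/vfredmin and read the result out of that wide register, while the new code reduces into an LMUL=1 result register (v_res) that is initialized once per call, with a second LMUL=1 register (v_z0, or v_max in the min kernels) supplying the scalar seed operand. Below is a minimal self-contained sketch of the new idiom, assuming the same pre-1.0 RVV intrinsic naming this patch targets (vsetvlmax_e32m1, vfredsum_vs_f32m8_f32m1, element extraction via v_res[0], exactly as the kernels do); sum_f32 is illustrative and not part of the patch:

#include <riscv_vector.h>

/* Illustrative only: sum n floats the way the patched kernels reduce. */
static float sum_f32(const float *x, size_t n)
{
        /* LMUL=1 registers, set up once at entry as the kernels do. */
        unsigned int gvl = vsetvlmax_e32m1();
        vfloat32m1_t v_res = vfmv_v_f_f32m1(0, gvl);  /* reduction result */
        vfloat32m1_t v_z0 = vfmv_v_f_f32m1(0, gvl);   /* zero seed for the scalar operand */
        float s = 0;
        for (size_t i = 0; i < n; ) {
                gvl = vsetvl_e32m8(n - i);
                vfloat32m8_t v0 = vle_v_f32m8(&x[i], gvl);
                /* v_res[0] = v_z0[0] + (v0[0] + ... + v0[gvl-1]) */
                v_res = vfredsum_vs_f32m8_f32m1(v_res, v0, v_z0, gvl);
                s += v_res[0];
                i += gvl;
        }
        return s;
}

Compared with the deleted sequence (va = VFMVVF_FLOAT(0, gvl); va = VFREDSUM_FLOAT(vr, va, gvl);), this saves one full-width vector fill per reduction and keeps the running result in a register that the next reduction can reuse.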