From a3b0ef6596d51ecfb59b0a2f6a7b0d59bc4f18b4 Mon Sep 17 00:00:00 2001
From: Sergei Lewis
Date: Thu, 1 Feb 2024 10:26:02 +0000
Subject: [PATCH] Restore riscv64 fixes from develop branch: dot product
 double precision accumulation, zscal NaN handling

---
 Makefile.prebuild             |   1 +
 kernel/riscv64/dot.c          |  10 ++++
 kernel/riscv64/zscal_rvv.c    | 104 +++++++---------------------------
 kernel/riscv64/zscal_vector.c |  79 +--------------------------------
 4 files changed, 33 insertions(+), 161 deletions(-)

diff --git a/Makefile.prebuild b/Makefile.prebuild
index b44b50039..b7d695a75 100644
--- a/Makefile.prebuild
+++ b/Makefile.prebuild
@@ -57,6 +57,7 @@ endif
 
 ifeq ($(TARGET), CK860FV)
 TARGET_FLAGS = -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float
+endif
 
 ifeq ($(TARGET), x280)
 TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d
diff --git a/kernel/riscv64/dot.c b/kernel/riscv64/dot.c
index bf55998ca..8ad493a2b 100644
--- a/kernel/riscv64/dot.c
+++ b/kernel/riscv64/dot.c
@@ -44,14 +44,24 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 {
 	BLASLONG i=0;
 	BLASLONG ix=0,iy=0;
+
+#if defined(DSDOT)
 	double dot = 0.0 ;
+#else
+	FLOAT dot = 0.0 ;
+#endif
 
 	if ( n < 1 ) return(dot);
 
 	while(i < n)
 	{
+#if defined(DSDOT)
+		dot += (double) y[iy] * (double) x[ix] ;
+#else
 		dot += y[iy] * x[ix] ;
+#endif
+
 		ix += inc_x ;
 		iy += inc_y ;
 		i++ ;
diff --git a/kernel/riscv64/zscal_rvv.c b/kernel/riscv64/zscal_rvv.c
index 2586c6036..ae79d9f9d 100644
--- a/kernel/riscv64/zscal_rvv.c
+++ b/kernel/riscv64/zscal_rvv.c
@@ -69,104 +69,42 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
     size_t vlmax = VSETVL_MAX;
     FLOAT_VX2_T vx2;
 
-    if(da_r == 0.0 && da_i == 0.0) {
+    if(inc_x == 1) {
 
-        vr = VFMVVF_FLOAT(0.0, vlmax);
-        vi = VFMVVF_FLOAT(0.0, vlmax);
-
-        if(inc_x == 1) {
-
-            for (size_t vl; n > 0; n -= vl, x += vl*2) {
-                vl = VSETVL(n);
-                vx2 = VSET_VX2(vx2, 0, vr);
-                vx2 = VSET_VX2(vx2, 1, vi);
-                VSSEG_FLOAT(x, vx2, vl);
-            }
-
-        } else {
-
-            for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
-                vl = VSETVL(n);
-                vx2 = VSET_VX2(vx2, 0, vr);
-                vx2 = VSET_VX2(vx2, 1, vi);
-                VSSSEG_FLOAT(x, stride_x, vx2, vl);
-            }
-        }
-
-    } else if(da_r == 0.0) {
-
-        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
+        for (size_t vl; n > 0; n -= vl, x += vl*2) {
             vl = VSETVL(n);
-
-            vx2 = VLSSEG_FLOAT(x, stride_x, vl);
+
+            vx2 = VLSEG_FLOAT(x, vl);
             vr = VGET_VX2(vx2, 0);
             vi = VGET_VX2(vx2, 1);
 
-            vt = VFMULVF_FLOAT(vi, -da_i, vl);
-            vi = VFMULVF_FLOAT(vr, da_i, vl);
+            vt = VFMULVF_FLOAT(vr, da_r, vl);
+            vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl);
+            vi = VFMULVF_FLOAT(vi, da_r, vl);
+            vi = VFMACCVF_FLOAT(vi, da_i, vr, vl);
 
             vx2 = VSET_VX2(vx2, 0, vt);
             vx2 = VSET_VX2(vx2, 1, vi);
-
-            VSSSEG_FLOAT(x, stride_x, vx2, vl);
-        }
-
-    } else if(da_i == 0.0) {
-
-        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
-            vl = VSETVL(n);
-
-            vx2 = VLSSEG_FLOAT(x, stride_x, vl);
-            vr = VGET_VX2(vx2, 0);
-            vi = VGET_VX2(vx2, 1);
-
-            vr = VFMULVF_FLOAT(vr, da_r, vl);
-            vi = VFMULVF_FLOAT(vi, da_r, vl);
-
-            vx2 = VSET_VX2(vx2, 0, vr);
-            vx2 = VSET_VX2(vx2, 1, vi);
-            VSSSEG_FLOAT(x, stride_x, vx2, vl);
+            VSSEG_FLOAT(x, vx2, vl);
         }
 
     } else {
 
-        if(inc_x == 1) {
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
+            vl = VSETVL(n);
 
-            for (size_t vl; n > 0; n -= vl, x += vl*2) {
-                vl = VSETVL(n);
+            vx2 = VLSSEG_FLOAT(x, stride_x, vl);
+            vr = VGET_VX2(vx2, 0);
+            vi = VGET_VX2(vx2, 1);
 
-                vx2 = VLSEG_FLOAT(x, vl);
-                vr = VGET_VX2(vx2, 0);
-                vi = VGET_VX2(vx2, 1);
+            vt = VFMULVF_FLOAT(vr, da_r, vl);
+            vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl);
+            vi = VFMULVF_FLOAT(vi, da_r, vl);
+            vi = VFMACCVF_FLOAT(vi, da_i, vr, vl);
 
-                vt = VFMULVF_FLOAT(vr, da_r, vl);
-                vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl);
-                vi = VFMULVF_FLOAT(vi, da_r, vl);
-                vi = VFMACCVF_FLOAT(vi, da_i, vr, vl);
-
-                vx2 = VSET_VX2(vx2, 0, vt);
-                vx2 = VSET_VX2(vx2, 1, vi);
-                VSSEG_FLOAT(x, vx2, vl);
-            }
-
-        } else {
-
-            for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
-                vl = VSETVL(n);
-
-                vx2 = VLSSEG_FLOAT(x, stride_x, vl);
-                vr = VGET_VX2(vx2, 0);
-                vi = VGET_VX2(vx2, 1);
-
-                vt = VFMULVF_FLOAT(vr, da_r, vl);
-                vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl);
-                vi = VFMULVF_FLOAT(vi, da_r, vl);
-                vi = VFMACCVF_FLOAT(vi, da_i, vr, vl);
-
-                vx2 = VSET_VX2(vx2, 0, vt);
-                vx2 = VSET_VX2(vx2, 1, vi);
-                VSSSEG_FLOAT(x, stride_x, vx2, vl);
-            }
+            vx2 = VSET_VX2(vx2, 0, vt);
+            vx2 = VSET_VX2(vx2, 1, vi);
+            VSSSEG_FLOAT(x, stride_x, vx2, vl);
         }
     }
diff --git a/kernel/riscv64/zscal_vector.c b/kernel/riscv64/zscal_vector.c
index 2034aafaa..536bbdf73 100644
--- a/kernel/riscv64/zscal_vector.c
+++ b/kernel/riscv64/zscal_vector.c
@@ -59,84 +59,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
 	unsigned int gvl = 0;
 	FLOAT_V_T vt, v0, v1;
 
-	if(da_r == 0.0 && da_i == 0.0){
-		gvl = VSETVL(n);
-		BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
-		BLASLONG inc_xv = inc_x * 2 * gvl;
-		vt = VFMVVF_FLOAT(0.0, gvl);
-		for(i=0,j=0; i < n/(gvl*2); i++){
-			VSSEV_FLOAT(&x[ix], stride_x, vt, gvl);
-			VSSEV_FLOAT(&x[ix+1], stride_x, vt, gvl);
-			VSSEV_FLOAT(&x[ix+inc_xv], stride_x, vt, gvl);
-			VSSEV_FLOAT(&x[ix+inc_xv+1], stride_x, vt, gvl);
-
-			j += gvl*2;
-			ix += inc_xv*2;
-		}
-		for(; j < n; ){
-			gvl = VSETVL(n-j);
-			VSSEV_FLOAT(&x[ix], stride_x, vt, gvl);
-			VSSEV_FLOAT(&x[ix+1], stride_x, vt, gvl);
-			j += gvl;
-			ix += inc_x * 2 * gvl;
-		}
-	}else if(da_r == 0.0){
-		gvl = VSETVL(n);
-		BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
-		BLASLONG inc_xv = inc_x * 2 * gvl;
-		for(i=0,j=0; i < n/gvl; i++){
-			v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
-			v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
-
-			vt = VFMULVF_FLOAT(v1, -da_i, gvl);
-			v1 = VFMULVF_FLOAT(v0, da_i, gvl);
-
-			VSSEV_FLOAT(&x[ix], stride_x, vt, gvl);
-			VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl);
-
-			j += gvl;
-			ix += inc_xv;
-		}
-		if(j < n){
-			gvl = VSETVL(n-j);
-			v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
-			v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
-
-			vt = VFMULVF_FLOAT(v1, -da_i, gvl);
-			v1 = VFMULVF_FLOAT(v0, da_i, gvl);
-
-			VSSEV_FLOAT(&x[ix], stride_x, vt, gvl);
-			VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl);
-		}
-	}else if(da_i == 0.0){
-		gvl = VSETVL(n);
-		BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
-		BLASLONG inc_xv = inc_x * 2 * gvl;
-		for(i=0,j=0; i < n/gvl; i++){
-			v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
-			v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
-
-			vt = VFMULVF_FLOAT(v0, da_r, gvl);
-			v1 = VFMULVF_FLOAT(v1, da_r, gvl);
-
-			VSSEV_FLOAT(&x[ix], stride_x, vt, gvl);
-			VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl);
-
-			j += gvl;
-			ix += inc_xv;
-		}
-		if(j < n){
-			gvl = VSETVL(n-j);
-			v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
-			v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
-
-			vt = VFMULVF_FLOAT(v0, da_r, gvl);
-			v1 = VFMULVF_FLOAT(v1, da_r, gvl);
-
-			VSSEV_FLOAT(&x[ix], stride_x, vt, gvl);
-			VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl);
-		}
-	}else{
+	{
 		gvl = VSETVL(n);
 		BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
 		BLASLONG inc_xv = inc_x * 2 * gvl;
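Background for the zscal changes above: the deleted fast paths stored zeros whenever da_r and/or da_i was zero, but IEEE-754 arithmetic requires 0.0 * NaN == NaN and 0.0 * Inf == NaN, so non-finite vector elements must propagate even when alpha is zero. A minimal standalone C sketch of the required behaviour (illustrative only, not part of the patch):

/* Illustrative only: why zscal may not short-circuit alpha == 0 + 0i.
 * IEEE-754 requires 0.0 * NaN == NaN and 0.0 * Inf == NaN, so scaling
 * must propagate non-finite inputs rather than storing zeros. */
#include <math.h>
#include <stdio.h>

int main(void) {
    double xr = NAN, xi = INFINITY;  /* one element of the input vector */
    double da_r = 0.0, da_i = 0.0;   /* alpha = 0 + 0i                  */

    /* Full complex multiply, as the restored kernels now always do:
     * (xr + xi*i) * (da_r + da_i*i)                                    */
    double yr = xr * da_r - xi * da_i;  /* NaN - NaN -> NaN */
    double yi = xi * da_r + xr * da_i;  /* NaN + NaN -> NaN */

    /* The removed shortcut would have written 0.0 to both parts. */
    printf("yr = %g, yi = %g\n", yr, yi);  /* prints: yr = nan, yi = nan */
    return 0;
}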
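Background for the dot.c change: DSDOT takes single-precision inputs but returns a double, so both the products and the running sum must be formed in double; with a float accumulator, any product smaller than half an ulp of the running sum is rounded away. A minimal C sketch (illustrative only, not part of the patch; assumes float arithmetic is not evaluated in extended precision):

/* Illustrative only: why DSDOT promotes products and the accumulator
 * to double. ulp(1.0f) is about 1.2e-7, so adding ~1e-8 to a float
 * running sum of 1.0 leaves it unchanged. */
#include <stdio.h>

int main(void) {
    float big = 1.0f, tiny = 1e-4f;  /* tiny*tiny ~= 1e-8 < ulp(1)/2 */
    float  acc_f = big * big;                  /* float accumulation  */
    double acc_d = (double)big * (double)big;  /* DSDOT-style         */

    for (int i = 0; i < 10000; i++) {
        acc_f += tiny * tiny;                  /* rounds away each time */
        acc_d += (double)tiny * (double)tiny;  /* keeps every term      */
    }

    /* Exact dot product is 1.0001; the float sum is stuck at 1.0. */
    printf("float: %.9f  double: %.9f\n", acc_f, acc_d);
    return 0;
}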