Update the amin function for the RISC-V platform

Improve the amin function by using fewer vector instructions.
Verified only on QEMU.
This commit is contained in:
Wu Zhigang 2021-03-10 11:07:57 -08:00
parent ef0238ba2b
commit 8c6e532358
1 changed files with 64 additions and 132 deletions

View File

@ -58,90 +58,56 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*
 * amin: return min(|x[0]|, |x[inc_x]|, ..., |x[(n-1)*inc_x]|) using the
 * RISC-V vector extension (RVV 0.7-era mnemonics: vle.v / vlse.v).
 *
 * Strategy: load a vector, build a mask of negative lanes (vmflt.vf vs. 0),
 * negate those lanes in place (vfrsub.vf with scalar 0, masked), accumulate
 * a running per-lane minimum, then reduce with VFREDMINVS_FLOAT.
 *
 * n      - element count; n <= 0 returns 0.0 (BLAS convention)
 * x      - input vector
 * inc_x  - element stride; inc_x <= 0 returns 0.0
 *
 * NOTE(review): the asm bodies do not execute vsetvli themselves; they rely
 * on the VL established by the preceding vsetvli()/intrinsic call — confirm
 * the intrinsics do not change VL between the call and the asm.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
        BLASLONG i=0, j=0;
        if (n <= 0 || inc_x <= 0)
                return(0.0);
        FLOAT minf=FLT_MAX;
        unsigned int gvl = 0;
        /* volatile kept from the original to discourage the compiler from
         * caching v0 across asm statements — presumably a workaround for an
         * early toolchain; TODO confirm it is still needed. */
        volatile FLOAT_V_T v0;
        FLOAT_V_T v_min, v1;
        FLOAT zero = 0.0;
        if(inc_x == 1){
                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
                if(gvl <= n/2){
                        /* Main loop: two unit-stride vector loads per trip. */
                        v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
                        for(i=0,j=0; i<n/(gvl*2); i++){
                                /* v0/v1 are written by vle.v/vfrsub.vf, so they
                                 * must be "+v" outputs, not inputs; "memory" is
                                 * clobbered because the asm reads x[] through
                                 * the pointer operands. */
                                asm volatile(
                                        "vle.v %1, (%4)             \n\t"
                                        "vle.v %2, (%5)             \n\t"
                                        "vmflt.vf v0, %1, %3        \n\t"
                                        "vfrsub.vf %1, %1, %3, v0.t \n\t"
                                        "vmflt.vf v0, %2, %3        \n\t"
                                        "vfrsub.vf %2, %2, %3, v0.t \n\t"
                                        "vfmin.vv %0, %0, %1        \n\t"
                                        "vfmin.vv %0, %0, %2        \n\t"
                                        :"+v"(v_min), "+v"(v0), "+v"(v1)
                                        :"f"(zero), "r"(&x[j]), "r"(&x[j+gvl])
                                        :"v0", "memory");
                                j += gvl*2;
                        }
                        /* Reduce the per-lane minima to a scalar. */
                        v1 = VFMVVF_FLOAT(FLT_MAX, gvl);
                        v0 = VFREDMINVS_FLOAT(v_min, v1, gvl);
                        minf = v0[0];
                }
                /* Tail: remaining elements, one (possibly short) vector at a time. */
                for(;j<n;){
                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
                        asm volatile(
                                "vle.v %0, (%2)             \n\t"
                                "vmflt.vf v0, %0, %1        \n\t"
                                "vfrsub.vf %0, %0, %1, v0.t \n\t"
                                :"+v"(v0)
                                :"f"(zero), "r"(&x[j])
                                :"v0", "memory");
                        v1 = VFMVVF_FLOAT(FLT_MAX, gvl);
                        v0 = VFREDMINVS_FLOAT(v0, v1, gvl);
                        if(v0[0] < minf)
                                minf = v0[0];
                        j += gvl;
                }
        }else{
                /* Strided path: vlse.v with byte stride inc_x * sizeof(FLOAT). */
                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
                BLASLONG stride_x = inc_x * sizeof(FLOAT);
                if(gvl <= n/2){
                        BLASLONG idx = 0, inc_xv = inc_x * gvl;
                        v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
                        for(i=0,j=0; i<n/(gvl*2); i++){
                                asm volatile(
                                        "vlse.v %1, (%4), %6        \n\t"
                                        "vlse.v %2, (%5), %6        \n\t"
                                        "vmflt.vf v0, %1, %3        \n\t"
                                        "vfrsub.vf %1, %1, %3, v0.t \n\t"
                                        "vmflt.vf v0, %2, %3        \n\t"
                                        "vfrsub.vf %2, %2, %3, v0.t \n\t"
                                        "vfmin.vv %0, %0, %1        \n\t"
                                        "vfmin.vv %0, %0, %2        \n\t"
                                        :"+v"(v_min), "+v"(v0), "+v"(v1)
                                        :"f"(zero), "r"(&x[idx]), "r"(&x[idx+inc_xv]), "r"(stride_x)
                                        :"v0", "memory");
                                j += gvl*2;
                                idx += inc_xv*2;
                        }
                        v1 = VFMVVF_FLOAT(FLT_MAX, gvl);
                        v0 = VFREDMINVS_FLOAT(v_min, v1, gvl);
                        minf = v0[0];
                }
                /* Strided tail loop. */
                for(;j<n;){
                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
                        asm volatile(
                                "vlse.v %0, (%2), %3        \n\t"
                                "vmflt.vf v0, %0, %1        \n\t"
                                "vfrsub.vf %0, %0, %1, v0.t \n\t"
                                :"+v"(v0)
                                :"f"(zero), "r"(&x[j*inc_x]), "r"(stride_x)
                                :"v0", "memory");
                        v1 = VFMVVF_FLOAT(FLT_MAX, gvl);
                        v0 = VFREDMINVS_FLOAT(v0, v1, gvl);
                        if(v0[0] < minf)
                                minf = v0[0];
                        j += gvl;
                }
        }
        return(minf);
}