update for riscv V extension 1.0 and arbitrary-shape gemm kernels

* all modules compile with the latest tools and vector extension version
* easily reconfigured for different architecture preferences
* reduce LMUL to avoid register spills (note: LMUL=8 uses a quarter of the register bank per variable! see the sketch below)
* multiple test fixes: corner cases (zero/negative inputs), nrm2 numeric stability
* added vectorised implementations for sum/zsum
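For context on the LMUL bullet: RVV provides 32 architectural vector registers, and a value declared at LMUL=8 occupies a group of 8 of them, so four such live values exhaust the bank and anything further spills. A minimal sketch of the trade-off, under the assumption of the pre-rename intrinsic spelling these kernels use (current toolchains expose the same intrinsics with a __riscv_ prefix); the function names are illustrative only:

#include <riscv_vector.h>

/* LMUL=8: each vfloat64m8_t is a group of 8 vector registers, so the four
   loads below already claim all 32 registers; keeping any further vector
   value live forces the compiler to spill to the stack. */
void lmul8_sketch(double *x, size_t n) {
    size_t vl = vsetvl_e64m8(n);
    vfloat64m8_t a = vle64_v_f64m8(x + 0 * vl, vl);
    vfloat64m8_t b = vle64_v_f64m8(x + 1 * vl, vl);
    vfloat64m8_t c = vle64_v_f64m8(x + 2 * vl, vl);
    vfloat64m8_t d = vle64_v_f64m8(x + 3 * vl, vl);  /* register bank now full */
    /* the temporaries below can only reuse registers of values already dead */
    vse64_v_f64m8(x, vfadd_vv_f64m8(vfadd_vv_f64m8(a, b, vl),
                                    vfadd_vv_f64m8(c, d, vl), vl), vl);
}

/* LMUL=2: the same working set takes 8 of 32 registers, leaving room for
   many more accumulators before anything spills. */
void lmul2_sketch(double *x, size_t n) {
    size_t vl = vsetvl_e64m2(n);
    vfloat64m2_t a = vle64_v_f64m2(x + 0 * vl, vl);
    vfloat64m2_t b = vle64_v_f64m2(x + 1 * vl, vl);
    vse64_v_f64m2(x, vfadd_vv_f64m2(a, b, vl), vl);
}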
This commit is contained in: parent 6c1076e133, commit efe0b84249
@@ -91,8 +91,14 @@ static inline int blas_quickdivide(blasint x, blasint y){
#define BUFFER_SIZE ( 32 << 20)
#define SEEK_ADDRESS

#if defined(C910V)
#include <riscv_vector.h>
#if defined(C910V) || defined(__clang__) || defined(RVV_COMPATIBLE_GCC)
# include <riscv_vector.h>
#endif

#if !defined(DOUBLE)
# define EXTRACT_FLOAT(v) vfmv_f_s_f32m1_f32(v)
#else
# define EXTRACT_FLOAT(v) vfmv_f_s_f64m1_f64(v)
#endif

#endif
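The new EXTRACT_FLOAT helper standardises how a scalar is pulled out of element 0 of an m1 vector, which is where RVV reductions leave their result; the kernels below previously reinterpreted the vector through a pointer cast instead. A hedged, self-contained sketch of the DOUBLE flavour (the helper name reduce_head is illustrative; intrinsic spelling follows this header):

#include <riscv_vector.h>

/* Sum the first vl elements of x, then extract the scalar result from
   element 0 of the m1 destination vector, exactly what EXTRACT_FLOAT does. */
static double reduce_head(const double *x, size_t n) {
    size_t vl = vsetvl_e64m8(n);
    vfloat64m8_t v = vle64_v_f64m8(x, vl);
    vfloat64m1_t acc = vfmv_v_f_f64m1(0, 1);          /* scalar seed = 0   */
    acc = vfredusum_vs_f64m8_f64m1(acc, v, acc, vl);  /* acc[0] = sum(v)   */
    return vfmv_f_s_f64m1_f64(acc);                   /* EXTRACT_FLOAT(acc) */
}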
@@ -70,12 +70,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#define CPU_GENERIC 0
#define CPU_C910V 1
#define CPU_GENERIC 0
#define CPU_C910V 1

static char *cpuname[] = {
"RISCV64_GENERIC",
"C910V"
"C910V",
};

int detect(void){
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -28,36 +28,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include <math.h>

#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLEV_FLOAT vle_v_f32m8
#define VLSEV_FLOAT vlse_v_f32m8
#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
#define MASK_T vbool4_t
#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
#define VFMAXVV_FLOAT vfmax_vv_f32m8
#define LMUL m4
#if defined(DOUBLE)
# define ELEN 64
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLEV_FLOAT vle_v_f64m8
#define VLSEV_FLOAT vlse_v_f64m8
#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
#define MASK_T vbool8_t
#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
#define VFMAXVV_FLOAT vfmax_vv_f64m8
# define ELEN 32
#endif

#define _
#define JOIN2_X(x, y) x ## y
#define JOIN2(x, y) JOIN2_X(x, y)
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)

#define VSETVL JOIN(vsetvl, _e, ELEN, LMUL, _)
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _)
#define VLEV_FLOAT JOIN(vle, ELEN, _v_f, ELEN, LMUL)
#define VLSEV_FLOAT JOIN(vlse, ELEN, _v_f, ELEN, LMUL)
#define VFREDMAXVS_FLOAT JOIN(vfredmax_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1))
#define VFABS_FLOAT JOIN(vfabs, _v_f, ELEN, LMUL, _)
#define VFMVVF_FLOAT JOIN(vfmv, _v_f_f, ELEN, LMUL, _)
#define VFMVVF_FLOAT_M1 JOIN(vfmv, _v_f_f, ELEN, m1, _)

FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i=0, j=0;
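A note on the JOIN machinery above, since every converted kernel now relies on it: `#define _` makes the token `_` expand to nothing, so it is just an empty filler for the five-slot JOIN, and each intrinsic name is pasted together from ELEN and LMUL. An illustration of the expansion (not part of the diff), taking `#define LMUL m4` and ELEN = 32:

/* #define LMUL m4, ELEN = 32                                                  */
/* VSETVL           JOIN(vsetvl, _e, 32, m4, _)    -> vsetvl_e32m4             */
/* FLOAT_V_T        JOIN(vfloat, 32, m4, _t, _)    -> vfloat32m4_t             */
/* VLEV_FLOAT       JOIN(vle, 32, _v_f, 32, m4)    -> vle32_v_f32m4            */
/* VFREDMAXVS_FLOAT JOIN(vfredmax_vs_f, 32, m4, _f, JOIN2(32, m1))
                                                   -> vfredmax_vs_f32m4_f32m1  */

Retuning a whole kernel for a different register-group size is therefore a one-line change to LMUL, which is what the commit's "reduce LMUL to avoid register spills" bullet refers to.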
@@ -65,103 +57,28 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
FLOAT maxf=0.0;
if (n <= 0 || inc_x <= 0) return(maxf);
unsigned int gvl = 0;
FLOAT_V_T v0, v1, v_max;
FLOAT_V_T_M1 v_res, v_zero;
gvl = VSETVL_MAX;
v_res = VFMVVF_FLOAT_M1(0, gvl);
v_zero = VFMVVF_FLOAT_M1(0, gvl);
FLOAT_V_T v0, v1;
FLOAT_V_T_M1 v_res;
v_res = VFMVVF_FLOAT_M1(0, 1);

MASK_T mask0, mask1;
FLOAT zero = 0.0;
if(inc_x == 1){
gvl = VSETVL(n);
if(gvl <= n/2){
v_max = VFMVVF_FLOAT(0, gvl);
for(i=0,j=0; i<n/(gvl*2); i++){
v0 = VLEV_FLOAT(&x[j], gvl);
v1 = VLEV_FLOAT(&x[j+gvl], gvl);
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
//v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
#if defined(DOUBLE)
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v0)
:"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v0)
:"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#endif

v_max = VFMAXVV_FLOAT(v_max, v0, gvl);

v1 = VLEV_FLOAT(&x[j+gvl], gvl);
mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
//v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl);
#if defined(DOUBLE)
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v1)
:"vd"(mask1), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v1)
:"vd"(mask1), "f"(zero), "r"(gvl)
:"v0");
#endif

v_max = VFMAXVV_FLOAT(v_max, v1, gvl);
v0 = VFABS_FLOAT(v0, gvl);
v1 = VFABS_FLOAT(v1, gvl);
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_res, gvl);
v_res = VFREDMAXVS_FLOAT(v_res, v1, v_res, gvl);
j += gvl*2;
}
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_zero, gvl);
maxf = *((FLOAT*)&v_res);
//maxf = v_res[0];
}
for(;j<n;){
gvl = VSETVL(n-j);
v0 = VLEV_FLOAT(&x[j], gvl);
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
//v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
#if defined(DOUBLE)
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v0)
:"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v0)
:"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#endif

v_res = VFREDMAXVS_FLOAT(v_res, v0, v_zero, gvl);
if(*((FLOAT*)&v_res) > maxf)
maxf = *((FLOAT*)&v_res);
v0 = VFABS_FLOAT(v0, gvl);
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_res, gvl);
j += gvl;
}
}else{
@@ -169,94 +86,27 @@ asm volatile(
BLASLONG stride_x = inc_x * sizeof(FLOAT);
if(gvl <= n/2){
BLASLONG inc_xv = inc_x * gvl;
v_max = VFMVVF_FLOAT(0, gvl);
for(i=0,j=0; i<n/(gvl*2); i++){
v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
//v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
#if defined(DOUBLE)
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v0)
:"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v0)
:"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#endif

v_max = VFMAXVV_FLOAT(v_max, v0, gvl);

v1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl);
mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
//v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl);
#if defined(DOUBLE)
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v1)
:"vd"(mask1), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v1)
:"vd"(mask1), "f"(zero), "r"(gvl)
:"v0");
#endif

v_max = VFMAXVV_FLOAT(v_max, v1, gvl);
v0 = VFABS_FLOAT(v0, gvl);
v1 = VFABS_FLOAT(v1, gvl);
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_res, gvl);
v_res = VFREDMAXVS_FLOAT(v_res, v1, v_res, gvl);
j += gvl*2;
ix += inc_xv*2;
}
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_zero, gvl);
maxf = *((FLOAT*)&v_res);
}
for(;j<n;){
gvl = VSETVL(n-j);
v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
//v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
#if defined(DOUBLE)
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v0)
:"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v0)
:"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#endif

v_res = VFREDMAXVS_FLOAT(v_res, v0, v_zero, gvl);
if(*((FLOAT*)&v_res) > maxf)
maxf = *((FLOAT*)&v_res);
v0 = VFABS_FLOAT(v0, gvl);
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_res, gvl);
j += gvl;
}
}

maxf = EXTRACT_FLOAT(v_res);
return(maxf);
}
@@ -26,232 +26,91 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"
#include <math.h>
#include <float.h>

#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLEV_FLOAT vle_v_f32m8
#define VLSEV_FLOAT vlse_v_f32m8
#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
#define MASK_T vbool4_t
#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
#define VFMINVV_FLOAT vfmin_vv_f32m8
#define LMUL m4
#if defined(DOUBLE)
# define ELEN 64
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLEV_FLOAT vle_v_f64m8
#define VLSEV_FLOAT vlse_v_f64m8
#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
#define MASK_T vbool8_t
#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
#define VFMINVV_FLOAT vfmin_vv_f64m8
# define ELEN 32
#endif

#define _
#define JOIN2_X(x, y) x ## y
#define JOIN2(x, y) JOIN2_X(x, y)
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)

#define VSETVL JOIN(vsetvl, _e, ELEN, LMUL, _)
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _)
#define VLEV_FLOAT JOIN(vle, ELEN, _v_f, ELEN, LMUL)
#define VLSEV_FLOAT JOIN(vlse, ELEN, _v_f, ELEN, LMUL)
#define VFREDMINVS_FLOAT JOIN(vfredmin_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1))
#define VFABS_FLOAT JOIN(vfabs, _v_f, ELEN, LMUL, _)
#define VFMVVF_FLOAT JOIN(vfmv, _v_f_f, ELEN, LMUL, _)
#define VFMVVF_FLOAT_M1 JOIN(vfmv, _v_f_f, ELEN, m1, _)

FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i=0, j=0;
if (n <= 0 || inc_x <= 0) return(0.0);
FLOAT minf=FLT_MAX;
unsigned int gvl = 0;
FLOAT_V_T v0, v1, v_min;
FLOAT_V_T_M1 v_res, v_max;
gvl = VSETVL_MAX;
v_res = VFMVVF_FLOAT_M1(0, gvl);
v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl);
BLASLONG i=0, j=0;
BLASLONG ix=0;
FLOAT minf=0.0;
if (n <= 0 || inc_x <= 0) return(minf);

minf = *x;
x += inc_x;
--n;
if (n == 0) return(minf);

unsigned int gvl = 0;
FLOAT_V_T v0, v1;
FLOAT_V_T_M1 v_res;
v_res = VFMVVF_FLOAT_M1(minf, 1);

MASK_T mask0, mask1;
FLOAT zero = 0.0;
if(inc_x == 1){
gvl = VSETVL(n);
if(gvl <= n/2){
v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
for(i=0,j=0; i<n/(gvl*2); i++){
v0 = VLEV_FLOAT(&x[j], gvl);
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
//v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
#if defined(DOUBLE)
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v0)
:"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v0)
:"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#endif
v_min = VFMINVV_FLOAT(v_min, v0, gvl);

v1 = VLEV_FLOAT(&x[j+gvl], gvl);
mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
//v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl);
#if defined(DOUBLE)
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v1)
:"vd"(mask1), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v1)
:"vd"(mask1), "f"(zero), "r"(gvl)
:"v0");
#endif

v_min = VFMINVV_FLOAT(v_min, v1, gvl);
v0 = VFABS_FLOAT(v0, gvl);
v1 = VFABS_FLOAT(v1, gvl);
v_res = VFREDMINVS_FLOAT(v_res, v0, v_res, gvl);
v_res = VFREDMINVS_FLOAT(v_res, v1, v_res, gvl);
j += gvl*2;
}
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
minf = *((FLOAT*)&v_res);
}
for(;j<n;){
gvl = VSETVL(n-j);
v0 = VLEV_FLOAT(&x[j], gvl);
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
//v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
#if defined(DOUBLE)
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v0)
:"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v0)
:"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#endif
v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl);
if(*((FLOAT*)&v_res) < minf)
minf = *((FLOAT*)&v_res);
v0 = VFABS_FLOAT(v0, gvl);
v_res = VFREDMINVS_FLOAT(v_res, v0, v_res, gvl);
j += gvl;
}
}else{
gvl = VSETVL(n);
BLASLONG stride_x = inc_x * sizeof(FLOAT);
if(gvl <= n/2){
BLASLONG idx = 0, inc_xv = inc_x * gvl;
v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
BLASLONG inc_xv = inc_x * gvl;
for(i=0,j=0; i<n/(gvl*2); i++){
v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl);
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
//v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
#if defined(DOUBLE)
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v0)
:"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v0)
:"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#endif
v_min = VFMINVV_FLOAT(v_min, v0, gvl);

v1 = VLSEV_FLOAT(&x[idx+inc_xv], stride_x, gvl);
mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
//v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl);
#if defined(DOUBLE)
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v1)
:"vd"(mask1), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v1)
:"vd"(mask1), "f"(zero), "r"(gvl)
:"v0");
#endif

v_min = VFMINVV_FLOAT(v_min, v1, gvl);
v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
v1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl);
v0 = VFABS_FLOAT(v0, gvl);
v1 = VFABS_FLOAT(v1, gvl);
v_res = VFREDMINVS_FLOAT(v_res, v0, v_res, gvl);
v_res = VFREDMINVS_FLOAT(v_res, v1, v_res, gvl);
j += gvl*2;
idx += inc_xv*2;
ix += inc_xv*2;
}
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
minf = *((FLOAT*)&v_res);
}
for(;j<n;){
gvl = VSETVL(n-j);
v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
//v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
#if defined(DOUBLE)
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v0)
:"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vsetvli zero, zero, e8, m1\n\t"
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+vd"(v0)
:"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#endif
v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl);
if(*((FLOAT*)&v_res) < minf)
minf = *((FLOAT*)&v_res);
v0 = VFABS_FLOAT(v0, gvl);
v_res = VFREDMINVS_FLOAT(v_res, v0, v_res, gvl);
j += gvl;
}
}
return(minf);

minf = EXTRACT_FLOAT(v_res);
return(minf);
}
@@ -28,35 +28,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include <math.h>

#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLEV_FLOAT vle_v_f32m8
#define VLSEV_FLOAT vlse_v_f32m8
#define VFREDSUMVS_FLOAT vfredosum_vs_f32m8_f32m1
#define MASK_T vbool4_t
#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
#define VFADDVV_FLOAT vfadd_vv_f32m8
#define LMUL m4
#if defined(DOUBLE)
# define ELEN 64
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLEV_FLOAT vle_v_f64m8
#define VLSEV_FLOAT vlse_v_f64m8
#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1
#define MASK_T vbool8_t
#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
#define VFADDVV_FLOAT vfadd_vv_f64m8
# define ELEN 32
#endif

#define _
#define JOIN2_X(x, y) x ## y
#define JOIN2(x, y) JOIN2_X(x, y)
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)

#define VSETVL JOIN(vsetvl, _e, ELEN, LMUL, _)
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _)
#define VLEV_FLOAT JOIN(vle, ELEN, _v_f, ELEN, LMUL)
#define VLSEV_FLOAT JOIN(vlse, ELEN, _v_f, ELEN, LMUL)
#define VFREDSUMVS_FLOAT JOIN(vfredusum_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1))
#define VFABS_FLOAT JOIN(vfabs, _v_f, ELEN, LMUL, _)
#define VFMVVF_FLOAT JOIN(vfmv, _v_f_f, ELEN, LMUL, _)
#define VFMVVF_FLOAT_M1 JOIN(vfmv, _v_f_f, ELEN, m1, _)
#define VFADDVV_FLOAT JOIN(vfadd, _vv_f, ELEN, LMUL, _)

FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i=0, j=0;
@@ -64,75 +58,61 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
FLOAT asumf=0.0;
if (n <= 0 || inc_x <= 0) return(asumf);
unsigned int gvl = 0;
FLOAT_V_T v0, v1, v_zero,v_sum;
FLOAT_V_T_M1 v_res, v_z0;
gvl = VSETVL_MAX;
v_res = VFMVVF_FLOAT_M1(0, gvl);
v_z0 = VFMVVF_FLOAT_M1(0, gvl);
FLOAT_V_T v0, v1, v_sum;
FLOAT_V_T_M1 v_res;
v_res = VFMVVF_FLOAT_M1(0, 1);

MASK_T mask0, mask1;
if(inc_x == 1){
gvl = VSETVL(n);
v_zero = VFMVVF_FLOAT(0, gvl);
if(gvl <= n/2){
v_sum = VFMVVF_FLOAT(0, gvl);
for(i=0,j=0; i<n/(gvl*2); i++){
v0 = VLEV_FLOAT(&x[j], gvl);
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
v0 = VFABS_FLOAT(v0, gvl);
v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);

v1 = VLEV_FLOAT(&x[j+gvl], gvl);
mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
v1 = VFABS_FLOAT(v1, gvl);
v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);
j += gvl * 2;
}
v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl);
asumf += *((FLOAT*)&v_res);
v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_res, gvl);
}
for(;j<n;){
gvl = VSETVL(n-j);
v0 = VLEV_FLOAT(&x[j], gvl);
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
v_res = VFREDSUMVS_FLOAT(v_res, v0, v_z0, gvl);
asumf += *((FLOAT*)&v_res);
v0 = VFABS_FLOAT(v0, gvl);
v_res = VFREDSUMVS_FLOAT(v_res, v0, v_res, gvl);
j += gvl;
}
}else{
gvl = VSETVL(n);
unsigned int stride_x = inc_x * sizeof(FLOAT);
v_zero = VFMVVF_FLOAT(0, gvl);
if(gvl <= n/2){
v_sum = VFMVVF_FLOAT(0, gvl);
BLASLONG inc_xv = inc_x * gvl;
for(i=0,j=0; i<n/(gvl*2); i++){
v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
v0 = VFABS_FLOAT(v0, gvl);
v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);

v1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl);
mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
v1 = VFABS_FLOAT(v1, gvl);
v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);
j += gvl * 2;
inc_xv += inc_xv * 2;
}
v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl);
asumf += *((FLOAT*)&v_res);
v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_res, gvl);
}
for(;j<n;){
gvl = VSETVL(n-j);
v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
v_res = VFREDSUMVS_FLOAT(v_res, v0, v_z0, gvl);
asumf += *((FLOAT*)&v_res);
v0 = VFABS_FLOAT(v0, gvl);
v_res = VFREDSUMVS_FLOAT(v_res, v0, v_res, gvl);
j += gvl;
}
}
asumf = EXTRACT_FLOAT(v_res);
return(asumf);
}
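The net effect of the asum rewrite above: the masked vfrsub negation (and the inline-assembly workaround that the amax/amin kernels needed for it) is replaced by one VFABS per vector, and the reduction moves out of the hot loop. A condensed sketch of the new unit-stride shape, written against the defines above; loop bounds are simplified for illustration and the tail elements are omitted:

FLOAT_V_T_M1 v_res = VFMVVF_FLOAT_M1(0, 1);
BLASLONG idx = 0;
size_t vl = VSETVL(n);
FLOAT_V_T v_sum = VFMVVF_FLOAT(0, vl);
for (; idx + (BLASLONG)vl <= n; idx += vl) {
    FLOAT_V_T v0 = VLEV_FLOAT(&x[idx], vl);
    v0 = VFABS_FLOAT(v0, vl);                       /* was: masked vfrsub */
    v_sum = VFADDVV_FLOAT(v_sum, v0, vl);
}
v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_res, vl);  /* one reduction per pass */
FLOAT asumf = EXTRACT_FLOAT(v_res);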
@@ -27,23 +27,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "common.h"

#define LMUL m4
#if defined(DOUBLE)
# define ELEN 64
#else
# define ELEN 32
#endif

#define _
#define JOIN2_X(x, y) x ## y
#define JOIN2(x, y) JOIN2_X(x, y)
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)

#define VSETVL JOIN(vsetvl, _e, ELEN, LMUL, _)
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
#define VLEV_FLOAT JOIN(vle, ELEN, _v_f, ELEN, LMUL)
#define VLSEV_FLOAT JOIN(vlse, ELEN, _v_f, ELEN, LMUL)
#define VSEV_FLOAT JOIN(vse, ELEN, _v_f, ELEN, LMUL)
#define VSSEV_FLOAT JOIN(vsse, ELEN, _v_f, ELEN, LMUL)
#define VFMACCVF_FLOAT JOIN(vfmacc, _vf_f, ELEN, LMUL, _)
#define VFMVVF_FLOAT JOIN(vfmv, _v_f_f, ELEN, LMUL, _)
#define VFMULVF_FLOAT JOIN(vfmul, _vf_f, ELEN, LMUL, _)

#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define FLOAT_V_T vfloat32m4_t
#define VLEV_FLOAT vle_v_f32m4
#define VLSEV_FLOAT vlse_v_f32m4
#define VSEV_FLOAT vse_v_f32m4
#define VSSEV_FLOAT vsse_v_f32m4
#define VLEV_FLOAT vle32_v_f32m4
#define VLSEV_FLOAT vlse32_v_f32m4
#define VSEV_FLOAT vse32_v_f32m4
#define VSSEV_FLOAT vsse32_v_f32m4
#define VFMACCVF_FLOAT vfmacc_vf_f32m4
#define VFMVVF_FLOAT vfmv_v_f_f32m4
#define VFMULVF_FLOAT vfmul_vf_f32m4
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define FLOAT_V_T vfloat64m4_t
#define VLEV_FLOAT vle_v_f64m4
#define VLSEV_FLOAT vlse_v_f64m4
#define VSEV_FLOAT vse_v_f64m4
#define VSSEV_FLOAT vsse_v_f64m4
#define VLEV_FLOAT vle64_v_f64m4
#define VLSEV_FLOAT vlse64_v_f64m4
#define VSEV_FLOAT vse64_v_f64m4
#define VSSEV_FLOAT vsse64_v_f64m4
#define VFMACCVF_FLOAT vfmacc_vf_f64m4
#define VFMVVF_FLOAT vfmv_v_f_f64m4
#define VFMULVF_FLOAT vfmul_vf_f64m4
@@ -25,26 +25,29 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/


#include "common.h"

#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define FLOAT_V_T vfloat32m4_t
#define VLEV_FLOAT vle_v_f32m4
#define VLSEV_FLOAT vlse_v_f32m4
#define VSEV_FLOAT vse_v_f32m4
#define VSSEV_FLOAT vsse_v_f32m4
#define VFMACCVF_FLOAT vfmacc_vf_f32m4
#define LMUL m4
#if defined(DOUBLE)
# define ELEN 64
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define FLOAT_V_T vfloat64m4_t
#define VLEV_FLOAT vle_v_f64m4
#define VLSEV_FLOAT vlse_v_f64m4
#define VSEV_FLOAT vse_v_f64m4
#define VSSEV_FLOAT vsse_v_f64m4
#define VFMACCVF_FLOAT vfmacc_vf_f64m4
# define ELEN 32
#endif

#define _
#define JOIN2_X(x, y) x ## y
#define JOIN2(x, y) JOIN2_X(x, y)
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)

#define VSETVL JOIN(vsetvl, _e, ELEN, LMUL, _)
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
#define VLEV_FLOAT JOIN(vle, ELEN, _v_f, ELEN, LMUL)
#define VLSEV_FLOAT JOIN(vlse, ELEN, _v_f, ELEN, LMUL)
#define VSEV_FLOAT JOIN(vse, ELEN, _v_f, ELEN, LMUL)
#define VSSEV_FLOAT JOIN(vsse, ELEN, _v_f, ELEN, LMUL)
#define VFMACCVF_FLOAT JOIN(vfmacc, _vf_f, ELEN, LMUL, _)

int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
BLASLONG i=0, j=0, jx=0, jy=0;
File diff suppressed because it is too large
@@ -25,22 +25,26 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define FLOAT_V_T vfloat32m8_t
#define VLEV_FLOAT vle_v_f32m8
#define VLSEV_FLOAT vlse_v_f32m8
#define VSEV_FLOAT vse_v_f32m8
#define VSSEV_FLOAT vsse_v_f32m8

#define LMUL m4
#if defined(DOUBLE)
# define ELEN 64
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define FLOAT_V_T vfloat64m8_t
#define VLEV_FLOAT vle_v_f64m8
#define VLSEV_FLOAT vlse_v_f64m8
#define VSEV_FLOAT vse_v_f64m8
#define VSSEV_FLOAT vsse_v_f64m8
# define ELEN 32
#endif

#define _
#define JOIN2_X(x, y) x ## y
#define JOIN2(x, y) JOIN2_X(x, y)
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)

#define VSETVL JOIN(vsetvl, _e, ELEN, LMUL, _)
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
#define VLEV_FLOAT JOIN(vle, ELEN, _v_f, ELEN, LMUL)
#define VLSEV_FLOAT JOIN(vlse, ELEN, _v_f, ELEN, LMUL)
#define VSEV_FLOAT JOIN(vse, ELEN, _v_f, ELEN, LMUL)
#define VSSEV_FLOAT JOIN(vsse, ELEN, _v_f, ELEN, LMUL)

int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
BLASLONG i=0, j=0;
File diff suppressed because it is too large
@@ -0,0 +1,859 @@
/*

AUTOGENERATED KERNEL
Settings:
LMUL=1
M=8
M_tail_scalar_from=2
N=8
complex=False
conjugate=False
cpu='zvl256b'
force_acc_double=False
index_type='BLASLONG'
op='gemm'
param_precision='double'
reg_width_bits=256
tail_policy='_ta'
trace=False

Derived:
ELEN_ACC=64
ELEN_PARAM=64
LMUL_ACC=1
VFMACC='vfmacc_vf_f64m1_ta'
VFMUL='vfmul_vf_f64m1_ta'
VLEV='vle64_v_f64m1'
VLSEV='vlse64_v_f64m1'
VMACC_TO_ACC='vfmacc_vf_f64m1_ta'
VMUL_TO_ACC='vfmul_vf_f64m1_ta'
VSETVL='vsetvl_e64m1'
VSEV='vse64_v_f64m1'
VSSEV='vsse64_v_f64m1'
acc_vector_t='vfloat64m1_t'
output='dgemm_kernel_8x8_zvl256b.c'
param_scalar_t='double'
param_vector_t='vfloat64m1_t'

*/
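Every "Derived" entry above is determined mechanically by the Settings: param_precision='double' fixes ELEN_PARAM=64, LMUL=1 picks the m1 types, and tail_policy='_ta' (tail-agnostic) is appended to the arithmetic intrinsics. A hedged reading of the name construction follows; the generator itself is not part of this diff, only its output is:

/* VSETVL = "vsetvl" + "_e" + ELEN_PARAM + "m" + LMUL      -> vsetvl_e64m1        */
/* VLEV   = "vle" + ELEN_PARAM + "_v_f" + ELEN_PARAM + "m1" -> vle64_v_f64m1      */
/* VFMACC = "vfmacc_vf_f" + ELEN_PARAM + "m" + LMUL + "_ta" -> vfmacc_vf_f64m1_ta */
/* cpu='zvl256b' means VLEN = 256, so an e64/m1 vector holds 256/64 = 4 doubles
   and the kernel's vsetvl_e64m1(4) request below is granted in full.            */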

#include "common.h"


int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc)

{
BLASLONG gvl = 0;
BLASLONG m_top = 0;
BLASLONG n_top = 0;


// -- MAIN PASS

for (BLASLONG j=0; j<N/8; j+=1) {
m_top = 0;
BLASLONG gvl = vsetvl_e64m1(4);


for (BLASLONG i=0; i<M/8; i+=1) {
BLASLONG ai=m_top*K;
BLASLONG bi=n_top*K;
double B0 = B[bi+0];
double B1 = B[bi+1];
double B2 = B[bi+2];
double B3 = B[bi+3];
double B4 = B[bi+4];
double B5 = B[bi+5];
double B6 = B[bi+6];
double B7 = B[bi+7];
bi += 8;

vfloat64m1_t A0 = vle64_v_f64m1( &A[ai+0*gvl], gvl );
vfloat64m1_t A1 = vle64_v_f64m1( &A[ai+1*gvl], gvl );
ai += 8;

vfloat64m1_t result0 = vfmul_vf_f64m1_ta( A0, B0, gvl);
vfloat64m1_t result1 = vfmul_vf_f64m1_ta( A1, B0, gvl);
vfloat64m1_t result2 = vfmul_vf_f64m1_ta( A0, B1, gvl);
vfloat64m1_t result3 = vfmul_vf_f64m1_ta( A1, B1, gvl);
vfloat64m1_t result4 = vfmul_vf_f64m1_ta( A0, B2, gvl);
vfloat64m1_t result5 = vfmul_vf_f64m1_ta( A1, B2, gvl);
vfloat64m1_t result6 = vfmul_vf_f64m1_ta( A0, B3, gvl);
vfloat64m1_t result7 = vfmul_vf_f64m1_ta( A1, B3, gvl);
vfloat64m1_t result8 = vfmul_vf_f64m1_ta( A0, B4, gvl);
vfloat64m1_t result9 = vfmul_vf_f64m1_ta( A1, B4, gvl);
vfloat64m1_t result10 = vfmul_vf_f64m1_ta( A0, B5, gvl);
vfloat64m1_t result11 = vfmul_vf_f64m1_ta( A1, B5, gvl);
vfloat64m1_t result12 = vfmul_vf_f64m1_ta( A0, B6, gvl);
vfloat64m1_t result13 = vfmul_vf_f64m1_ta( A1, B6, gvl);
vfloat64m1_t result14 = vfmul_vf_f64m1_ta( A0, B7, gvl);
vfloat64m1_t result15 = vfmul_vf_f64m1_ta( A1, B7, gvl);

for(BLASLONG k=1; k<K; k++) {
B0 = B[bi+0];
B1 = B[bi+1];
B2 = B[bi+2];
B3 = B[bi+3];
B4 = B[bi+4];
B5 = B[bi+5];
B6 = B[bi+6];
B7 = B[bi+7];
bi += 8;

A0 = vle64_v_f64m1( &A[ai+0*gvl], gvl );
A1 = vle64_v_f64m1( &A[ai+1*gvl], gvl );
ai += 8;

result0 = vfmacc_vf_f64m1_ta( result0, B0, A0, gvl);
result1 = vfmacc_vf_f64m1_ta( result1, B0, A1, gvl);
result2 = vfmacc_vf_f64m1_ta( result2, B1, A0, gvl);
result3 = vfmacc_vf_f64m1_ta( result3, B1, A1, gvl);
result4 = vfmacc_vf_f64m1_ta( result4, B2, A0, gvl);
result5 = vfmacc_vf_f64m1_ta( result5, B2, A1, gvl);
result6 = vfmacc_vf_f64m1_ta( result6, B3, A0, gvl);
result7 = vfmacc_vf_f64m1_ta( result7, B3, A1, gvl);
result8 = vfmacc_vf_f64m1_ta( result8, B4, A0, gvl);
result9 = vfmacc_vf_f64m1_ta( result9, B4, A1, gvl);
result10 = vfmacc_vf_f64m1_ta( result10, B5, A0, gvl);
result11 = vfmacc_vf_f64m1_ta( result11, B5, A1, gvl);
result12 = vfmacc_vf_f64m1_ta( result12, B6, A0, gvl);
result13 = vfmacc_vf_f64m1_ta( result13, B6, A1, gvl);
result14 = vfmacc_vf_f64m1_ta( result14, B7, A0, gvl);
result15 = vfmacc_vf_f64m1_ta( result15, B7, A1, gvl);
}


BLASLONG ci=n_top*ldc+m_top;

vfloat64m1_t c0 = vle64_v_f64m1( &C[ci], gvl); ci += gvl;
vfloat64m1_t c1 = vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
vfloat64m1_t c2 = vle64_v_f64m1( &C[ci], gvl); ci += gvl;
vfloat64m1_t c3 = vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
vfloat64m1_t c4 = vle64_v_f64m1( &C[ci], gvl); ci += gvl;
vfloat64m1_t c5 = vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
vfloat64m1_t c6 = vle64_v_f64m1( &C[ci], gvl); ci += gvl;
vfloat64m1_t c7 = vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
vfloat64m1_t c8 = vle64_v_f64m1( &C[ci], gvl); ci += gvl;
vfloat64m1_t c9 = vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
vfloat64m1_t c10 = vle64_v_f64m1( &C[ci], gvl); ci += gvl;
vfloat64m1_t c11 = vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
vfloat64m1_t c12 = vle64_v_f64m1( &C[ci], gvl); ci += gvl;
vfloat64m1_t c13 = vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
vfloat64m1_t c14 = vle64_v_f64m1( &C[ci], gvl); ci += gvl;
vfloat64m1_t c15 = vle64_v_f64m1( &C[ci], gvl);
c0 = vfmacc_vf_f64m1_ta( c0, alpha, result0, gvl );
c1 = vfmacc_vf_f64m1_ta( c1, alpha, result1, gvl );
c2 = vfmacc_vf_f64m1_ta( c2, alpha, result2, gvl );
c3 = vfmacc_vf_f64m1_ta( c3, alpha, result3, gvl );
c4 = vfmacc_vf_f64m1_ta( c4, alpha, result4, gvl );
c5 = vfmacc_vf_f64m1_ta( c5, alpha, result5, gvl );
c6 = vfmacc_vf_f64m1_ta( c6, alpha, result6, gvl );
c7 = vfmacc_vf_f64m1_ta( c7, alpha, result7, gvl );
c8 = vfmacc_vf_f64m1_ta( c8, alpha, result8, gvl );
c9 = vfmacc_vf_f64m1_ta( c9, alpha, result9, gvl );
c10 = vfmacc_vf_f64m1_ta( c10, alpha, result10, gvl );
c11 = vfmacc_vf_f64m1_ta( c11, alpha, result11, gvl );
c12 = vfmacc_vf_f64m1_ta( c12, alpha, result12, gvl );
c13 = vfmacc_vf_f64m1_ta( c13, alpha, result13, gvl );
c14 = vfmacc_vf_f64m1_ta( c14, alpha, result14, gvl );
c15 = vfmacc_vf_f64m1_ta( c15, alpha, result15, gvl );

ci=n_top*ldc+m_top;

vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl;
vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*1;
vse64_v_f64m1( &C[ci], c2, gvl); ci += gvl;
vse64_v_f64m1( &C[ci], c3, gvl); ci += ldc-gvl*1;
vse64_v_f64m1( &C[ci], c4, gvl); ci += gvl;
vse64_v_f64m1( &C[ci], c5, gvl); ci += ldc-gvl*1;
vse64_v_f64m1( &C[ci], c6, gvl); ci += gvl;
vse64_v_f64m1( &C[ci], c7, gvl); ci += ldc-gvl*1;
vse64_v_f64m1( &C[ci], c8, gvl); ci += gvl;
vse64_v_f64m1( &C[ci], c9, gvl); ci += ldc-gvl*1;
vse64_v_f64m1( &C[ci], c10, gvl); ci += gvl;
vse64_v_f64m1( &C[ci], c11, gvl); ci += ldc-gvl*1;
vse64_v_f64m1( &C[ci], c12, gvl); ci += gvl;
vse64_v_f64m1( &C[ci], c13, gvl); ci += ldc-gvl*1;
vse64_v_f64m1( &C[ci], c14, gvl); ci += gvl;
vse64_v_f64m1( &C[ci], c15, gvl);
m_top += 8;
}
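It is worth spelling out why this 8x8 block avoids spills at LMUL=1: vsetvl_e64m1(4) yields 4 doubles per vector on a 256-bit machine, so eight rows of A are covered by the two vectors A0/A1; crossed with the eight B scalars that gives the sixteen accumulators result0..result15, and 16+2 register groups sit comfortably inside the 32-entry vector register file. A standalone sanity check of that arithmetic (numbers taken from the Settings block above):

#include <assert.h>

int main(void) {
    int vlen_bits = 256, elen = 64, lmul = 1;  /* cpu='zvl256b', e64, m1      */
    int lanes = vlen_bits * lmul / elen;       /* 4 doubles per vector        */
    int M = 8, N = 8;
    int a_vectors = M / lanes;                 /* A0 and A1 cover the 8 rows  */
    int accumulators = a_vectors * N;          /* result0 .. result15         */
    assert(lanes == 4);
    assert(a_vectors == 2 && accumulators == 16);
    assert(accumulators + a_vectors <= 32);    /* fits the register file      */
    return 0;
}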
|
||||
|
||||
|
||||
// -- tails for main pass
|
||||
|
||||
if( M & 4 ) {
|
||||
gvl = vsetvl_e64m1(4);
|
||||
|
||||
BLASLONG ai=m_top*K;
|
||||
BLASLONG bi=n_top*K;
|
||||
double B0 = B[bi+0];
|
||||
double B1 = B[bi+1];
|
||||
double B2 = B[bi+2];
|
||||
double B3 = B[bi+3];
|
||||
double B4 = B[bi+4];
|
||||
double B5 = B[bi+5];
|
||||
double B6 = B[bi+6];
|
||||
double B7 = B[bi+7];
|
||||
bi += 8;
|
||||
|
||||
vfloat64m1_t A0 = vle64_v_f64m1( &A[ai+0*gvl], gvl );
|
||||
ai += 4;
|
||||
|
||||
vfloat64m1_t result0 = vfmul_vf_f64m1_ta( A0, B0, gvl);
|
||||
vfloat64m1_t result1 = vfmul_vf_f64m1_ta( A0, B1, gvl);
|
||||
vfloat64m1_t result2 = vfmul_vf_f64m1_ta( A0, B2, gvl);
|
||||
vfloat64m1_t result3 = vfmul_vf_f64m1_ta( A0, B3, gvl);
|
||||
vfloat64m1_t result4 = vfmul_vf_f64m1_ta( A0, B4, gvl);
|
||||
vfloat64m1_t result5 = vfmul_vf_f64m1_ta( A0, B5, gvl);
|
||||
vfloat64m1_t result6 = vfmul_vf_f64m1_ta( A0, B6, gvl);
|
||||
vfloat64m1_t result7 = vfmul_vf_f64m1_ta( A0, B7, gvl);
|
||||
|
||||
for(BLASLONG k=1; k<K; k++) {
|
||||
B0 = B[bi+0];
|
||||
B1 = B[bi+1];
|
||||
B2 = B[bi+2];
|
||||
B3 = B[bi+3];
|
||||
B4 = B[bi+4];
|
||||
B5 = B[bi+5];
|
||||
B6 = B[bi+6];
|
||||
B7 = B[bi+7];
|
||||
bi += 8;
|
||||
|
||||
A0 = vle64_v_f64m1( &A[ai+0*gvl], gvl );
|
||||
ai += 4;
|
||||
|
||||
result0 = vfmacc_vf_f64m1_ta( result0, B0, A0, gvl);
|
||||
result1 = vfmacc_vf_f64m1_ta( result1, B1, A0, gvl);
|
||||
result2 = vfmacc_vf_f64m1_ta( result2, B2, A0, gvl);
|
||||
result3 = vfmacc_vf_f64m1_ta( result3, B3, A0, gvl);
|
||||
result4 = vfmacc_vf_f64m1_ta( result4, B4, A0, gvl);
|
||||
result5 = vfmacc_vf_f64m1_ta( result5, B5, A0, gvl);
|
||||
result6 = vfmacc_vf_f64m1_ta( result6, B6, A0, gvl);
|
||||
result7 = vfmacc_vf_f64m1_ta( result7, B7, A0, gvl);
|
||||
}
|
||||
|
||||
|
||||
BLASLONG ci=n_top*ldc+m_top;
|
||||
|
||||
vfloat64m1_t c0 = vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
|
||||
vfloat64m1_t c1 = vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
|
||||
vfloat64m1_t c2 = vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
|
||||
vfloat64m1_t c3 = vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
|
||||
vfloat64m1_t c4 = vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
|
||||
vfloat64m1_t c5 = vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
|
||||
vfloat64m1_t c6 = vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
|
||||
vfloat64m1_t c7 = vle64_v_f64m1( &C[ci], gvl);
|
||||
c0 = vfmacc_vf_f64m1_ta( c0, alpha, result0, gvl );
|
||||
c1 = vfmacc_vf_f64m1_ta( c1, alpha, result1, gvl );
|
||||
c2 = vfmacc_vf_f64m1_ta( c2, alpha, result2, gvl );
|
||||
c3 = vfmacc_vf_f64m1_ta( c3, alpha, result3, gvl );
|
||||
c4 = vfmacc_vf_f64m1_ta( c4, alpha, result4, gvl );
|
||||
c5 = vfmacc_vf_f64m1_ta( c5, alpha, result5, gvl );
|
||||
c6 = vfmacc_vf_f64m1_ta( c6, alpha, result6, gvl );
|
||||
c7 = vfmacc_vf_f64m1_ta( c7, alpha, result7, gvl );
|
||||
|
||||
ci=n_top*ldc+m_top;
|
||||
|
||||
vse64_v_f64m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
|
||||
vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*0;
|
||||
vse64_v_f64m1( &C[ci], c2, gvl); ci += ldc-gvl*0;
|
||||
vse64_v_f64m1( &C[ci], c3, gvl); ci += ldc-gvl*0;
|
||||
vse64_v_f64m1( &C[ci], c4, gvl); ci += ldc-gvl*0;
|
||||
vse64_v_f64m1( &C[ci], c5, gvl); ci += ldc-gvl*0;
|
||||
vse64_v_f64m1( &C[ci], c6, gvl); ci += ldc-gvl*0;
|
||||
vse64_v_f64m1( &C[ci], c7, gvl);
|
||||
m_top += 4;
|
||||
}
|
||||
|
||||
|
||||
if( M & 2 ) {
|
||||
double result0 = 0;
|
||||
double result1 = 0;
|
||||
double result2 = 0;
|
||||
double result3 = 0;
|
||||
double result4 = 0;
|
||||
double result5 = 0;
|
||||
double result6 = 0;
|
||||
double result7 = 0;
|
||||
double result8 = 0;
|
||||
double result9 = 0;
|
||||
double result10 = 0;
|
||||
double result11 = 0;
|
||||
double result12 = 0;
|
||||
double result13 = 0;
|
||||
double result14 = 0;
|
||||
double result15 = 0;
|
||||
BLASLONG ai=m_top*K;
|
||||
BLASLONG bi=n_top*K;
|
||||
|
||||
for(BLASLONG k=0; k<K; k++) {
|
||||
result0+=A[ai+0]*B[bi+0];
|
||||
result1+=A[ai+1]*B[bi+0];
|
||||
result2+=A[ai+0]*B[bi+1];
|
||||
result3+=A[ai+1]*B[bi+1];
|
||||
result4+=A[ai+0]*B[bi+2];
|
||||
result5+=A[ai+1]*B[bi+2];
|
||||
result6+=A[ai+0]*B[bi+3];
|
||||
result7+=A[ai+1]*B[bi+3];
|
||||
result8+=A[ai+0]*B[bi+4];
|
||||
result9+=A[ai+1]*B[bi+4];
|
||||
result10+=A[ai+0]*B[bi+5];
|
||||
result11+=A[ai+1]*B[bi+5];
|
||||
result12+=A[ai+0]*B[bi+6];
|
||||
result13+=A[ai+1]*B[bi+6];
|
||||
result14+=A[ai+0]*B[bi+7];
|
||||
result15+=A[ai+1]*B[bi+7];
|
||||
ai+=2;
|
||||
bi+=8;
|
||||
}
|
||||
|
||||
BLASLONG ci=n_top*ldc+m_top;
|
||||
C[ci+0*ldc+0] += alpha * result0;
|
||||
C[ci+0*ldc+1] += alpha * result1;
|
||||
C[ci+1*ldc+0] += alpha * result2;
|
||||
C[ci+1*ldc+1] += alpha * result3;
|
||||
C[ci+2*ldc+0] += alpha * result4;
|
||||
C[ci+2*ldc+1] += alpha * result5;
|
||||
C[ci+3*ldc+0] += alpha * result6;
|
||||
C[ci+3*ldc+1] += alpha * result7;
|
||||
C[ci+4*ldc+0] += alpha * result8;
|
||||
C[ci+4*ldc+1] += alpha * result9;
|
||||
C[ci+5*ldc+0] += alpha * result10;
|
||||
C[ci+5*ldc+1] += alpha * result11;
|
||||
C[ci+6*ldc+0] += alpha * result12;
|
||||
C[ci+6*ldc+1] += alpha * result13;
|
||||
C[ci+7*ldc+0] += alpha * result14;
|
||||
C[ci+7*ldc+1] += alpha * result15;
|
||||
m_top+=2;
|
||||
}
|
||||
|
||||
|
||||
if( M & 1 ) {
|
||||
double result0 = 0;
|
||||
double result1 = 0;
|
||||
double result2 = 0;
|
||||
double result3 = 0;
|
||||
double result4 = 0;
|
||||
double result5 = 0;
|
||||
double result6 = 0;
|
||||
double result7 = 0;
|
||||
BLASLONG ai=m_top*K;
|
||||
BLASLONG bi=n_top*K;
|
||||
|
||||
for(BLASLONG k=0; k<K; k++) {
|
||||
result0+=A[ai+0]*B[bi+0];
|
||||
result1+=A[ai+0]*B[bi+1];
|
||||
result2+=A[ai+0]*B[bi+2];
|
||||
result3+=A[ai+0]*B[bi+3];
|
||||
result4+=A[ai+0]*B[bi+4];
|
||||
result5+=A[ai+0]*B[bi+5];
|
||||
result6+=A[ai+0]*B[bi+6];
|
||||
result7+=A[ai+0]*B[bi+7];
|
||||
ai+=1;
|
||||
bi+=8;
|
||||
}
|
||||
|
||||
BLASLONG ci=n_top*ldc+m_top;
|
||||
C[ci+0*ldc+0] += alpha * result0;
|
||||
C[ci+1*ldc+0] += alpha * result1;
|
||||
C[ci+2*ldc+0] += alpha * result2;
|
||||
C[ci+3*ldc+0] += alpha * result3;
|
||||
C[ci+4*ldc+0] += alpha * result4;
|
||||
C[ci+5*ldc+0] += alpha * result5;
|
||||
C[ci+6*ldc+0] += alpha * result6;
|
||||
C[ci+7*ldc+0] += alpha * result7;
|
||||
m_top+=1;
|
||||
}
|
||||
|
||||
n_top += 8;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// -- tails for N=4
|
||||
|
||||
if( N & 4 ) {
|
||||
gvl = vsetvl_e64m1(4);
|
||||
m_top = 0;
|
||||
|
||||
for (BLASLONG i=0; i<M/8; i+=1) {
|
||||
BLASLONG ai=m_top*K;
|
||||
BLASLONG bi=n_top*K;
|
||||
double B0 = B[bi+0];
|
||||
double B1 = B[bi+1];
|
||||
double B2 = B[bi+2];
|
||||
double B3 = B[bi+3];
|
||||
bi += 4;
|
||||
|
||||
vfloat64m1_t A0 = vle64_v_f64m1( &A[ai+0*gvl], gvl );
|
||||
vfloat64m1_t A1 = vle64_v_f64m1( &A[ai+1*gvl], gvl );
|
||||
ai += 8;
|
||||
|
||||
vfloat64m1_t result0 = vfmul_vf_f64m1_ta( A0, B0, gvl);
|
||||
vfloat64m1_t result1 = vfmul_vf_f64m1_ta( A1, B0, gvl);
|
||||
vfloat64m1_t result2 = vfmul_vf_f64m1_ta( A0, B1, gvl);
|
||||
vfloat64m1_t result3 = vfmul_vf_f64m1_ta( A1, B1, gvl);
|
||||
vfloat64m1_t result4 = vfmul_vf_f64m1_ta( A0, B2, gvl);
|
||||
vfloat64m1_t result5 = vfmul_vf_f64m1_ta( A1, B2, gvl);
|
||||
vfloat64m1_t result6 = vfmul_vf_f64m1_ta( A0, B3, gvl);
|
||||
vfloat64m1_t result7 = vfmul_vf_f64m1_ta( A1, B3, gvl);
|
||||
|
||||
for(BLASLONG k=1; k<K; k++) {
|
||||
B0 = B[bi+0];
|
||||
B1 = B[bi+1];
|
||||
B2 = B[bi+2];
|
||||
B3 = B[bi+3];
|
||||
bi += 4;
|
||||
|
||||
A0 = vle64_v_f64m1( &A[ai+0*gvl], gvl );
|
||||
A1 = vle64_v_f64m1( &A[ai+1*gvl], gvl );
|
||||
ai += 8;
|
||||
|
||||
result0 = vfmacc_vf_f64m1_ta( result0, B0, A0, gvl);
|
||||
result1 = vfmacc_vf_f64m1_ta( result1, B0, A1, gvl);
|
||||
result2 = vfmacc_vf_f64m1_ta( result2, B1, A0, gvl);
|
||||
result3 = vfmacc_vf_f64m1_ta( result3, B1, A1, gvl);
|
||||
result4 = vfmacc_vf_f64m1_ta( result4, B2, A0, gvl);
|
||||
result5 = vfmacc_vf_f64m1_ta( result5, B2, A1, gvl);
|
||||
result6 = vfmacc_vf_f64m1_ta( result6, B3, A0, gvl);
|
||||
result7 = vfmacc_vf_f64m1_ta( result7, B3, A1, gvl);
|
||||
}
|
||||
|
||||
|
||||
BLASLONG ci=n_top*ldc+m_top;
|
||||
|
||||
vfloat64m1_t c0 = vle64_v_f64m1( &C[ci], gvl); ci += gvl;
|
||||
vfloat64m1_t c1 = vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
|
||||
vfloat64m1_t c2 = vle64_v_f64m1( &C[ci], gvl); ci += gvl;
|
||||
vfloat64m1_t c3 = vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
|
||||
vfloat64m1_t c4 = vle64_v_f64m1( &C[ci], gvl); ci += gvl;
|
||||
vfloat64m1_t c5 = vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
|
||||
vfloat64m1_t c6 = vle64_v_f64m1( &C[ci], gvl); ci += gvl;
|
||||
vfloat64m1_t c7 = vle64_v_f64m1( &C[ci], gvl);
|
||||
c0 = vfmacc_vf_f64m1_ta( c0, alpha, result0, gvl );
|
||||
c1 = vfmacc_vf_f64m1_ta( c1, alpha, result1, gvl );
|
||||
c2 = vfmacc_vf_f64m1_ta( c2, alpha, result2, gvl );
|
||||
c3 = vfmacc_vf_f64m1_ta( c3, alpha, result3, gvl );
|
||||
c4 = vfmacc_vf_f64m1_ta( c4, alpha, result4, gvl );
|
||||
c5 = vfmacc_vf_f64m1_ta( c5, alpha, result5, gvl );
|
||||
c6 = vfmacc_vf_f64m1_ta( c6, alpha, result6, gvl );
|
||||
c7 = vfmacc_vf_f64m1_ta( c7, alpha, result7, gvl );
|
||||
|
||||
ci=n_top*ldc+m_top;
|
||||
|
||||
vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl;
|
||||
vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*1;
|
||||
vse64_v_f64m1( &C[ci], c2, gvl); ci += gvl;
|
||||
vse64_v_f64m1( &C[ci], c3, gvl); ci += ldc-gvl*1;
|
||||
vse64_v_f64m1( &C[ci], c4, gvl); ci += gvl;
|
||||
vse64_v_f64m1( &C[ci], c5, gvl); ci += ldc-gvl*1;
|
||||
vse64_v_f64m1( &C[ci], c6, gvl); ci += gvl;
|
||||
vse64_v_f64m1( &C[ci], c7, gvl);
|
||||
m_top += 8;
|
||||
}
|
||||
|
||||
|
||||
if( M & 4 ) {
|
||||
gvl = vsetvl_e64m1(4);
|
||||
|
||||
BLASLONG ai=m_top*K;
|
||||
BLASLONG bi=n_top*K;
|
||||
double B0 = B[bi+0];
|
||||
double B1 = B[bi+1];
|
||||
double B2 = B[bi+2];
|
||||
double B3 = B[bi+3];
|
||||
bi += 4;
|
||||
|
||||
vfloat64m1_t A0 = vle64_v_f64m1( &A[ai+0*gvl], gvl );
|
||||
ai += 4;
|
||||
|
||||
vfloat64m1_t result0 = vfmul_vf_f64m1_ta( A0, B0, gvl);
|
||||
vfloat64m1_t result1 = vfmul_vf_f64m1_ta( A0, B1, gvl);
|
||||
vfloat64m1_t result2 = vfmul_vf_f64m1_ta( A0, B2, gvl);
|
||||
vfloat64m1_t result3 = vfmul_vf_f64m1_ta( A0, B3, gvl);
|
||||
|
||||
for(BLASLONG k=1; k<K; k++) {
|
||||
B0 = B[bi+0];
|
||||
B1 = B[bi+1];
|
||||
B2 = B[bi+2];
|
||||
B3 = B[bi+3];
|
||||
bi += 4;
|
||||
|
||||
A0 = vle64_v_f64m1( &A[ai+0*gvl], gvl );
|
||||
ai += 4;
|
||||
|
||||
result0 = vfmacc_vf_f64m1_ta( result0, B0, A0, gvl);
|
||||
result1 = vfmacc_vf_f64m1_ta( result1, B1, A0, gvl);
|
||||
result2 = vfmacc_vf_f64m1_ta( result2, B2, A0, gvl);
|
||||
result3 = vfmacc_vf_f64m1_ta( result3, B3, A0, gvl);
|
||||
}
|
||||
|
||||
|
||||
BLASLONG ci=n_top*ldc+m_top;
|
||||
|
||||
vfloat64m1_t c0 = vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
|
||||
vfloat64m1_t c1 = vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
|
||||
vfloat64m1_t c2 = vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
|
||||
vfloat64m1_t c3 = vle64_v_f64m1( &C[ci], gvl);
|
||||
c0 = vfmacc_vf_f64m1_ta( c0, alpha, result0, gvl );
|
||||
c1 = vfmacc_vf_f64m1_ta( c1, alpha, result1, gvl );
|
||||
c2 = vfmacc_vf_f64m1_ta( c2, alpha, result2, gvl );
|
||||
c3 = vfmacc_vf_f64m1_ta( c3, alpha, result3, gvl );
|
||||
|
||||
ci=n_top*ldc+m_top;
|
||||
|
||||
vse64_v_f64m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
|
||||
vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*0;
|
||||
vse64_v_f64m1( &C[ci], c2, gvl); ci += ldc-gvl*0;
|
||||
vse64_v_f64m1( &C[ci], c3, gvl);
|
||||
m_top += 4;
|
||||
}
|
||||
|
||||
|
||||
if( M & 2 ) {
|
||||
double result0 = 0;
|
||||
double result1 = 0;
|
||||
double result2 = 0;
|
||||
double result3 = 0;
|
||||
double result4 = 0;
|
||||
double result5 = 0;
|
||||
double result6 = 0;
|
||||
double result7 = 0;
|
||||
BLASLONG ai=m_top*K;
|
||||
BLASLONG bi=n_top*K;
|
||||
|
||||
for(BLASLONG k=0; k<K; k++) {
|
||||
result0+=A[ai+0]*B[bi+0];
|
||||
result1+=A[ai+1]*B[bi+0];
|
||||
result2+=A[ai+0]*B[bi+1];
|
||||
result3+=A[ai+1]*B[bi+1];
|
||||
result4+=A[ai+0]*B[bi+2];
|
||||
result5+=A[ai+1]*B[bi+2];
|
||||
result6+=A[ai+0]*B[bi+3];
|
||||
result7+=A[ai+1]*B[bi+3];
|
||||
ai+=2;
|
||||
bi+=4;
|
||||
}
|
||||
|
||||
BLASLONG ci=n_top*ldc+m_top;
|
||||
C[ci+0*ldc+0] += alpha * result0;
|
||||
C[ci+0*ldc+1] += alpha * result1;
|
||||
C[ci+1*ldc+0] += alpha * result2;
|
||||
C[ci+1*ldc+1] += alpha * result3;
|
||||
C[ci+2*ldc+0] += alpha * result4;
|
||||
C[ci+2*ldc+1] += alpha * result5;
|
||||
C[ci+3*ldc+0] += alpha * result6;
|
||||
C[ci+3*ldc+1] += alpha * result7;
|
||||
m_top+=2;
|
||||
}
|
||||
|
||||
|
||||
if( M & 1 ) {
|
||||
double result0 = 0;
|
||||
double result1 = 0;
|
||||
double result2 = 0;
|
||||
double result3 = 0;
|
||||
BLASLONG ai=m_top*K;
|
||||
BLASLONG bi=n_top*K;
|
||||
|
||||
for(BLASLONG k=0; k<K; k++) {
|
||||
result0+=A[ai+0]*B[bi+0];
|
||||
result1+=A[ai+0]*B[bi+1];
|
||||
result2+=A[ai+0]*B[bi+2];
|
||||
result3+=A[ai+0]*B[bi+3];
|
||||
ai+=1;
|
||||
bi+=4;
|
||||
}
|
||||
|
||||
BLASLONG ci=n_top*ldc+m_top;
|
||||
C[ci+0*ldc+0] += alpha * result0;
|
||||
C[ci+1*ldc+0] += alpha * result1;
|
||||
C[ci+2*ldc+0] += alpha * result2;
|
||||
C[ci+3*ldc+0] += alpha * result3;
|
||||
m_top+=1;
|
||||
}
|
||||
|
||||
n_top += 4;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// -- tails for N=2
|
||||
|
||||
if( N & 2 ) {
|
||||
gvl = vsetvl_e64m1(4);
|
||||
m_top = 0;
|
||||
|
||||
for (BLASLONG i=0; i<M/8; i+=1) {
|
||||
BLASLONG ai=m_top*K;
|
||||
BLASLONG bi=n_top*K;
|
||||
double B0 = B[bi+0];
|
||||
double B1 = B[bi+1];
|
||||
bi += 2;
|
||||
|
||||
vfloat64m1_t A0 = vle64_v_f64m1( &A[ai+0*gvl], gvl );
|
||||
vfloat64m1_t A1 = vle64_v_f64m1( &A[ai+1*gvl], gvl );
|
||||
ai += 8;
|
||||
|
||||
vfloat64m1_t result0 = vfmul_vf_f64m1_ta( A0, B0, gvl);
|
||||
vfloat64m1_t result1 = vfmul_vf_f64m1_ta( A1, B0, gvl);
|
||||
vfloat64m1_t result2 = vfmul_vf_f64m1_ta( A0, B1, gvl);
|
||||
vfloat64m1_t result3 = vfmul_vf_f64m1_ta( A1, B1, gvl);
|
||||
|
||||
for(BLASLONG k=1; k<K; k++) {
|
||||
B0 = B[bi+0];
|
||||
B1 = B[bi+1];
|
||||
bi += 2;
|
||||
|
||||
A0 = vle64_v_f64m1( &A[ai+0*gvl], gvl );
|
||||
A1 = vle64_v_f64m1( &A[ai+1*gvl], gvl );
|
||||
ai += 8;
|
||||
|
||||
result0 = vfmacc_vf_f64m1_ta( result0, B0, A0, gvl);
|
||||
result1 = vfmacc_vf_f64m1_ta( result1, B0, A1, gvl);
|
||||
result2 = vfmacc_vf_f64m1_ta( result2, B1, A0, gvl);
|
||||
result3 = vfmacc_vf_f64m1_ta( result3, B1, A1, gvl);
|
||||
}
|
||||
|
||||
|
||||
BLASLONG ci=n_top*ldc+m_top;
|
||||
|
||||
vfloat64m1_t c0 = vle64_v_f64m1( &C[ci], gvl); ci += gvl;
|
||||
vfloat64m1_t c1 = vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
|
||||
vfloat64m1_t c2 = vle64_v_f64m1( &C[ci], gvl); ci += gvl;
|
||||
vfloat64m1_t c3 = vle64_v_f64m1( &C[ci], gvl);
|
||||
c0 = vfmacc_vf_f64m1_ta( c0, alpha, result0, gvl );
|
||||
c1 = vfmacc_vf_f64m1_ta( c1, alpha, result1, gvl );
|
||||
c2 = vfmacc_vf_f64m1_ta( c2, alpha, result2, gvl );
|
||||
c3 = vfmacc_vf_f64m1_ta( c3, alpha, result3, gvl );
|
||||
|
||||
ci=n_top*ldc+m_top;
|
||||
|
||||
vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl;
|
||||
vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*1;
            vse64_v_f64m1( &C[ci], c2, gvl); ci += gvl;
            vse64_v_f64m1( &C[ci], c3, gvl);
            m_top += 8;
        }

        if( M & 4 ) {
            gvl = vsetvl_e64m1(4);

            BLASLONG ai=m_top*K;
            BLASLONG bi=n_top*K;
            double B0 = B[bi+0];
            double B1 = B[bi+1];
            bi += 2;

            vfloat64m1_t A0 = vle64_v_f64m1( &A[ai+0*gvl], gvl );
            ai += 4;

            vfloat64m1_t result0 = vfmul_vf_f64m1_ta( A0, B0, gvl);
            vfloat64m1_t result1 = vfmul_vf_f64m1_ta( A0, B1, gvl);

            for(BLASLONG k=1; k<K; k++) {
                B0 = B[bi+0];
                B1 = B[bi+1];
                bi += 2;

                A0 = vle64_v_f64m1( &A[ai+0*gvl], gvl );
                ai += 4;

                result0 = vfmacc_vf_f64m1_ta( result0, B0, A0, gvl);
                result1 = vfmacc_vf_f64m1_ta( result1, B1, A0, gvl);
            }

            BLASLONG ci=n_top*ldc+m_top;

            vfloat64m1_t c0 = vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
            vfloat64m1_t c1 = vle64_v_f64m1( &C[ci], gvl);
            c0 = vfmacc_vf_f64m1_ta( c0, alpha, result0, gvl );
            c1 = vfmacc_vf_f64m1_ta( c1, alpha, result1, gvl );

            ci=n_top*ldc+m_top;

            vse64_v_f64m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
            vse64_v_f64m1( &C[ci], c1, gvl);
            m_top += 4;
        }

        if( M & 2 ) {
            double result0 = 0;
            double result1 = 0;
            double result2 = 0;
            double result3 = 0;
            BLASLONG ai=m_top*K;
            BLASLONG bi=n_top*K;

            for(BLASLONG k=0; k<K; k++) {
                result0+=A[ai+0]*B[bi+0];
                result1+=A[ai+1]*B[bi+0];
                result2+=A[ai+0]*B[bi+1];
                result3+=A[ai+1]*B[bi+1];
                ai+=2;
                bi+=2;
            }

            BLASLONG ci=n_top*ldc+m_top;
            C[ci+0*ldc+0] += alpha * result0;
            C[ci+0*ldc+1] += alpha * result1;
            C[ci+1*ldc+0] += alpha * result2;
            C[ci+1*ldc+1] += alpha * result3;
            m_top+=2;
        }

        if( M & 1 ) {
            double result0 = 0;
            double result1 = 0;
            BLASLONG ai=m_top*K;
            BLASLONG bi=n_top*K;

            for(BLASLONG k=0; k<K; k++) {
                result0+=A[ai+0]*B[bi+0];
                result1+=A[ai+0]*B[bi+1];
                ai+=1;
                bi+=2;
            }

            BLASLONG ci=n_top*ldc+m_top;
            C[ci+0*ldc+0] += alpha * result0;
            C[ci+1*ldc+0] += alpha * result1;
            m_top+=1;
        }

        n_top += 2;
    }


    // -- tails for N=1

    if( N & 1 ) {
        gvl = vsetvl_e64m1(4);
        m_top = 0;

        for (BLASLONG i=0; i<M/8; i+=1) {
            BLASLONG ai=m_top*K;
            BLASLONG bi=n_top*K;
            double B0 = B[bi+0];
            bi += 1;

            vfloat64m1_t A0 = vle64_v_f64m1( &A[ai+0*gvl], gvl );
            vfloat64m1_t A1 = vle64_v_f64m1( &A[ai+1*gvl], gvl );
            ai += 8;

            vfloat64m1_t result0 = vfmul_vf_f64m1_ta( A0, B0, gvl);
            vfloat64m1_t result1 = vfmul_vf_f64m1_ta( A1, B0, gvl);

            for(BLASLONG k=1; k<K; k++) {
                B0 = B[bi+0];
                bi += 1;

                A0 = vle64_v_f64m1( &A[ai+0*gvl], gvl );
                A1 = vle64_v_f64m1( &A[ai+1*gvl], gvl );
                ai += 8;

                result0 = vfmacc_vf_f64m1_ta( result0, B0, A0, gvl);
                result1 = vfmacc_vf_f64m1_ta( result1, B0, A1, gvl);
            }

            BLASLONG ci=n_top*ldc+m_top;

            vfloat64m1_t c0 = vle64_v_f64m1( &C[ci], gvl); ci += gvl;
            vfloat64m1_t c1 = vle64_v_f64m1( &C[ci], gvl);
            c0 = vfmacc_vf_f64m1_ta( c0, alpha, result0, gvl );
            c1 = vfmacc_vf_f64m1_ta( c1, alpha, result1, gvl );

            ci=n_top*ldc+m_top;

            vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl;
            vse64_v_f64m1( &C[ci], c1, gvl);
            m_top += 8;
        }

        if( M & 4 ) {
            gvl = vsetvl_e64m1(4);

            BLASLONG ai=m_top*K;
            BLASLONG bi=n_top*K;
            double B0 = B[bi+0];
            bi += 1;

            vfloat64m1_t A0 = vle64_v_f64m1( &A[ai+0*gvl], gvl );
            ai += 4;

            vfloat64m1_t result0 = vfmul_vf_f64m1_ta( A0, B0, gvl);

            for(BLASLONG k=1; k<K; k++) {
                B0 = B[bi+0];
                bi += 1;

                A0 = vle64_v_f64m1( &A[ai+0*gvl], gvl );
                ai += 4;

                result0 = vfmacc_vf_f64m1_ta( result0, B0, A0, gvl);
            }

            BLASLONG ci=n_top*ldc+m_top;

            vfloat64m1_t c0 = vle64_v_f64m1( &C[ci], gvl);
            c0 = vfmacc_vf_f64m1_ta( c0, alpha, result0, gvl );

            ci=n_top*ldc+m_top;

            vse64_v_f64m1( &C[ci], c0, gvl);
            m_top += 4;
        }

        if( M & 2 ) {
            double result0 = 0;
            double result1 = 0;
            BLASLONG ai=m_top*K;
            BLASLONG bi=n_top*K;

            for(BLASLONG k=0; k<K; k++) {
                result0+=A[ai+0]*B[bi+0];
                result1+=A[ai+1]*B[bi+0];
                ai+=2;
                bi+=1;
            }

            BLASLONG ci=n_top*ldc+m_top;
            C[ci+0*ldc+0] += alpha * result0;
            C[ci+0*ldc+1] += alpha * result1;
            m_top+=2;
        }

        if( M & 1 ) {
            double result0 = 0;
            BLASLONG ai=m_top*K;
            BLASLONG bi=n_top*K;

            for(BLASLONG k=0; k<K; k++) {
                result0+=A[ai+0]*B[bi+0];
                ai+=1;
                bi+=1;
            }

            BLASLONG ci=n_top*ldc+m_top;
            C[ci+0*ldc+0] += alpha * result0;
            m_top+=1;
        }

        n_top += 1;
    }

    return 0;
}
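/* The generated kernel above peels the M remainder in powers of two: a full
   8-wide vector pass, a 4-wide vector pass for (M & 4), then plain scalar
   loops for the (M & 2) and (M & 1) leftovers; the same scheme repeats for
   the N=2 main pass and the N=1 tail. */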
@ -46,7 +46,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
    BLASLONG ix=0,iy=0;
    double dot = 0.0 ;

    if ( n < 0 ) return(dot);
    if ( n < 1 ) return(dot);

    while(i < n)
    {
@ -31,10 +31,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m4_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
#define VLEV_FLOAT vle_v_f32m4
#define VLSEV_FLOAT vlse_v_f32m4
#define VFREDSUM_FLOAT vfredosum_vs_f32m4_f32m1
#define VLEV_FLOAT vle32_v_f32m4
#define VLSEV_FLOAT vlse32_v_f32m4
#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
#define VFMACCVV_FLOAT vfmacc_vv_f32m4
#define VFMVVF_FLOAT vfmv_v_f_f32m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1

@ -44,9 +43,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m4_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
#define VLEV_FLOAT vle_v_f64m4
#define VLSEV_FLOAT vlse_v_f64m4
#define VLEV_FLOAT vle64_v_f64m4
#define VLSEV_FLOAT vlse64_v_f64m4
#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
#define VFMACCVV_FLOAT vfmacc_vv_f64m4
#define VFMVVF_FLOAT vfmv_v_f_f64m4

@ -63,7 +61,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
    BLASLONG i=0, j=0;
    double dot = 0.0 ;

    if ( n < 0 ) return(dot);
    if ( n < 1 ) return(dot);

    FLOAT_V_T vr, vx, vy;
    unsigned int gvl = 0;

@ -83,7 +81,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
        }
        if(j > 0){
            v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
            dot += (double)VFMVFS_FLOAT(v_res);
            dot += (double)EXTRACT_FLOAT(v_res);
        }
        //tail
        if(j < n){

@ -94,12 +92,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
            //vr = VFDOTVV_FLOAT(vx, vy, gvl);
            vr = VFMACCVV_FLOAT(vz, vx, vy, gvl);
            v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
            dot += (double)VFMVFS_FLOAT(v_res);
            dot += (double)EXTRACT_FLOAT(v_res);
        }
    }else if(inc_y == 1){
        gvl = VSETVL(n);
        vr = VFMVVF_FLOAT(0, gvl);
        int stride_x = inc_x * sizeof(FLOAT);
        BLASLONG stride_x = inc_x * sizeof(FLOAT);
        for(i=0,j=0; i<n/gvl; i++){
            vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
            vy = VLEV_FLOAT(&y[j], gvl);

@ -108,8 +106,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
        }
        if(j > 0){
            v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
            dot += (double)VFMVFS_FLOAT(v_res);
            dot += (double)EXTRACT_FLOAT(v_res);
        }
        //tail
        if(j < n){

@ -120,13 +117,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
            //vr = VFDOTVV_FLOAT(vx, vy, gvl);
            vr = VFMACCVV_FLOAT(vz, vx, vy, gvl);
            v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
            dot += (double)VFMVFS_FLOAT(v_res);
            dot += (double)EXTRACT_FLOAT(v_res);
        }
    }else if(inc_x == 1){
        gvl = VSETVL(n);
        vr = VFMVVF_FLOAT(0, gvl);
        int stride_y = inc_y * sizeof(FLOAT);
        BLASLONG stride_y = inc_y * sizeof(FLOAT);
        for(i=0,j=0; i<n/gvl; i++){
            vx = VLEV_FLOAT(&x[j], gvl);
            vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl);

@ -135,8 +131,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
        }
        if(j > 0){
            v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
            dot += (double)VFMVFS_FLOAT(v_res);
            dot += (double)EXTRACT_FLOAT(v_res);
        }
        //tail
        if(j < n){

@ -147,14 +142,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
            //vr = VFDOTVV_FLOAT(vx, vy, gvl);
            vr = VFMACCVV_FLOAT(vz, vx, vy, gvl);
            v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
            dot += (double)VFMVFS_FLOAT(v_res);
            dot += (double)EXTRACT_FLOAT(v_res);
        }
    }else{
        gvl = VSETVL(n);
        vr = VFMVVF_FLOAT(0, gvl);
        int stride_x = inc_x * sizeof(FLOAT);
        int stride_y = inc_y * sizeof(FLOAT);
        BLASLONG stride_x = inc_x * sizeof(FLOAT);
        BLASLONG stride_y = inc_y * sizeof(FLOAT);
        for(i=0,j=0; i<n/gvl; i++){
            vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
            vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl);

@ -163,8 +157,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
        }
        if(j > 0){
            v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
            dot += (double)VFMVFS_FLOAT(v_res);
            dot += (double)EXTRACT_FLOAT(v_res);
        }
        //tail
        if(j < n){

@ -175,8 +168,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
            //vr = VFDOTVV_FLOAT(vx, vy, gvl);
            vr = VFMACCVV_FLOAT(vz, vx, vy, gvl);
            v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
            dot += (double)VFMVFS_FLOAT(v_res);
            dot += (double)EXTRACT_FLOAT(v_res);
        }
    }
    return(dot);
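/* Note on the reduction change above: vfredosum (ordered) became vfredusum
   (unordered), so the hardware may add the partial sums in any order. That
   can only change rounding, not the mathematical result, and the running
   total is accumulated in a double either way. */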
File diff suppressed because it is too large
@ -29,18 +29,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define FLOAT_V_T vfloat32m4_t
#define VLEV_FLOAT vle_v_f32m4
#define VLSEV_FLOAT vlse_v_f32m4
#define VSEV_FLOAT vse_v_f32m4
#define VSSEV_FLOAT vsse_v_f32m4
#define VLEV_FLOAT vle32_v_f32m4
#define VLSEV_FLOAT vlse32_v_f32m4
#define VSEV_FLOAT vse32_v_f32m4
#define VSSEV_FLOAT vsse32_v_f32m4
#define VFMACCVF_FLOAT vfmacc_vf_f32m4
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define FLOAT_V_T vfloat64m4_t
#define VLEV_FLOAT vle_v_f64m4
#define VLSEV_FLOAT vlse_v_f64m4
#define VSEV_FLOAT vse_v_f64m4
#define VSSEV_FLOAT vsse_v_f64m4
#define VLEV_FLOAT vle64_v_f64m4
#define VLSEV_FLOAT vlse64_v_f64m4
#define VSEV_FLOAT vse64_v_f64m4
#define VSSEV_FLOAT vsse64_v_f64m4
#define VFMACCVF_FLOAT vfmacc_vf_f64m4
#endif

@ -28,13 +28,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m4_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
#define VLEV_FLOAT vle_v_f32m4
#define VLSEV_FLOAT vlse_v_f32m4
#define VFREDSUM_FLOAT vfredosum_vs_f32m4_f32m1
#define VLEV_FLOAT vle32_v_f32m4
#define VLSEV_FLOAT vlse32_v_f32m4
#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
#define VFMACCVV_FLOAT vfmacc_vv_f32m4
#define VFMVVF_FLOAT vfmv_v_f_f32m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1

@ -42,12 +40,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define VFMULVV_FLOAT vfmul_vv_f32m4
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m4_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
#define VLEV_FLOAT vle_v_f64m4
#define VLSEV_FLOAT vlse_v_f64m4
#define VLEV_FLOAT vle64_v_f64m4
#define VLSEV_FLOAT vlse64_v_f64m4
#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
#define VFMACCVV_FLOAT vfmacc_vv_f64m4
#define VFMVVF_FLOAT vfmv_v_f_f64m4

@ -58,17 +54,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
    BLASLONG i = 0, j = 0, k = 0;
    BLASLONG ix = 0, iy = 0;
    FLOAT *a_ptr = a;
    BLASLONG i = 0, j = 0, k = 0;
    BLASLONG ix = 0, iy = 0;
    FLOAT *a_ptr = a;
    FLOAT temp;

    FLOAT_V_T va, vr, vx;
    unsigned int gvl = 0;
    FLOAT_V_T_M1 v_res, v_z0;
    gvl = VSETVL_MAX;
    v_res = VFMVVF_FLOAT_M1(0, gvl);
    v_z0 = VFMVVF_FLOAT_M1(0, gvl);
    v_res = VFMVVF_FLOAT_M1(0, 1);
    v_z0 = VFMVVF_FLOAT_M1(0, 1);

    if(inc_x == 1){
        for(i = 0; i < n; i++){

@ -82,7 +77,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
                j += gvl;
            }
            v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
            temp = (FLOAT)VFMVFS_FLOAT(v_res);
            temp = (FLOAT)EXTRACT_FLOAT(v_res);
            if(j < m){
                gvl = VSETVL(m-j);
                va = VLEV_FLOAT(&a_ptr[j], gvl);

@ -90,7 +85,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
                vr = VFMULVV_FLOAT(va, vx, gvl);

                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
                temp += (FLOAT)VFMVFS_FLOAT(v_res);
                temp += (FLOAT)EXTRACT_FLOAT(v_res);
            }
            y[iy] += alpha * temp;
            iy += inc_y;

@ -98,7 +93,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
        }
    }else{
        BLASLONG stride_x = inc_x * sizeof(FLOAT);

        for(i = 0; i < n; i++){
            gvl = VSETVL(m);
            BLASLONG inc_xv = inc_x * gvl;

@ -110,10 +104,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
                vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
                vr = VFMACCVV_FLOAT(vr, va, vx, gvl);
                j += gvl;
                ix += inc_xv;
                ix += inc_x * gvl;
            }
            v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
            temp = (FLOAT)VFMVFS_FLOAT(v_res);
            temp = (FLOAT)EXTRACT_FLOAT(v_res);
            if(j < m){
                gvl = VSETVL(m-j);
                va = VLEV_FLOAT(&a_ptr[j], gvl);

@ -121,7 +115,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
                vr = VFMULVV_FLOAT(va, vx, gvl);

                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
                temp += (FLOAT)VFMVFS_FLOAT(v_res);
                temp += (FLOAT)EXTRACT_FLOAT(v_res);
            }
            y[iy] += alpha * temp;
            iy += inc_y;
@ -0,0 +1,669 @@
#!/usr/bin/python3

import sys, os
import contextlib

#-----------------------------------------------------------------------
def ERROR(*args, **kwargs):
    print(*args, file=sys.stderr, **kwargs)
    sys.exit(-1)

class Target(object):
    def __init__( self, out, mappings, initial_level=0, tab_width=4 ):
        self._level = initial_level
        self._tab_width = tab_width
        self._out = out
        self._mappings = mappings

    @contextlib.contextmanager
    def map( self, **items ):
        old_mappings = self._mappings
        self._mappings = dict(old_mappings, **items)
        yield self._mappings
        self._mappings = old_mappings

    @contextlib.contextmanager
    def block( self, start=None, end=None, **args ):
        with self.map(**args):
            if start is not None:
                self.write()
                self.write(start)
            self._level += 1
            yield self._level
            self._level -= 1
            if end is not None:
                self.write(end)
                self.write()

    def write( self, fmt=None, *args, **kwargs ):
        if fmt is not None:
            mappings = dict(self._mappings, **kwargs) if kwargs else self._mappings
            self._out(self._indent_str() + fmt.format(*args, **mappings))
        else:
            self._out("")

    def _indent_str( self ):
        return ' ' * (self._level * self._tab_width)

#-----------------------------------------------------------------------
def generate_trmm_block( dest ):
    dest.write("{index_type} pass_K = K;")
    dest.write("#ifdef LEFT")
    with dest.block():
        dest.write("{index_type} off = offset + m_top;")
    dest.write("#else")
    with dest.block():
        dest.write("{index_type} off = -offset + n_top;")
    dest.write("#endif")

    dest.write("#ifdef BACKWARDS")
    with dest.block():
        dest.write("ai += off*{M}{elt_size};")
        dest.write("bi += off*{N}{elt_size};")
        dest.write("pass_K -= off;")
    dest.write("#else")
    with dest.block():
        dest.write("#ifdef LEFT")
        with dest.block():
            dest.write("pass_K = off + {M};")
        dest.write("#else")
        with dest.block():
            dest.write("pass_K = off + {N};")
        dest.write("#endif")
    dest.write("#endif")

#-----------------------------------------------------------------------
def generate_gemm_kernel_inner_real( settings, dest, M, N, vlen, a_regs ):
    TRMM = (settings['op'].value == 'trmm')
    narrow_result = (settings['param_precision'].value != 'double') and settings['force_acc_double'].value

    with dest.map(M=M, N=N):
        dest.write("{index_type} ai=m_top*K{elt_size};")
        dest.write("{index_type} bi=n_top*K{elt_size};")
        if TRMM:
            generate_trmm_block( dest )

        for i in range(N):
            dest.write("{param_scalar_t} B{i} = B[bi+{i}];", i=i)
        dest.write("bi += {N};")
        dest.write()

        for i in range(a_regs):
            dest.write("{param_vector_t} A{i} = {VLEV}( &A[ai+{i}*gvl], gvl );", i=i)
        dest.write("ai += {M};")
        dest.write()

        for j in range(N):
            for i in range(a_regs):
                dest.write("{acc_vector_t} result{dest} = {VMUL_TO_ACC}( A{i}, B{j}, gvl);", dest=j*a_regs+i, i=i, j=j)

        with dest.block("for({index_type} k=1; k<{Kend}; k++) {{", "}}", Kend=('pass_K' if TRMM else 'K')):
            for i in range(N):
                dest.write("B{i} = B[bi+{i}];", i=i )
            dest.write("bi += {N};")
            dest.write()

            for i in range(a_regs):
                dest.write("A{i} = {VLEV}( &A[ai+{i}*gvl], gvl );", i=i)

            dest.write("ai += {M};")
            dest.write()

            for j in range(N):
                for i in range(a_regs):
                    dest.write("result{dest} = {VMACC_TO_ACC}( result{dest}, B{j}, A{i}, gvl);", dest= j*a_regs+i, j=j, i=i )

        dest.write()
        dest.write("{index_type} ci=n_top*ldc+m_top;")
        dest.write()

        if narrow_result:
            for j in range(N):
                for i in range(a_regs):
                    dest.write("{param_vector_t} narrowed{idx} = {VFNCVT}( result{idx}, gvl );", idx=j*a_regs+i)

        if not TRMM:
            for j in range(N):
                for i in range(a_regs):
                    idx = j*a_regs+i
                    increment = ' ci += ldc-gvl*{};'.format(a_regs-1) if (i == a_regs-1) else ' ci += gvl;'
                    if idx == N*a_regs-1:
                        increment = ''
                    dest.write("{param_vector_t} c{idx} = {VLEV}( &C[ci], gvl);{increment}", idx=idx, increment=increment)

        if narrow_result:
            for j in range(N):
                for i in range(a_regs):
                    idx = j*a_regs+i
                    if TRMM:
                        dest.write("{param_vector_t} c{idx} = {VFMUL}( narrowed{idx}, alpha, gvl );", idx=idx)
                    else:
                        dest.write("c{idx} = {VFMACC}( c{idx}, alpha, narrowed{idx}, gvl );", idx=idx)
        else:
            for j in range(N):
                for i in range(a_regs):
                    idx = j*a_regs+i
                    if TRMM:
                        dest.write("{param_vector_t} c{idx} = {VFMUL}( result{idx}, alpha, gvl );", idx=idx)
                    else:
                        dest.write("c{idx} = {VFMACC}( c{idx}, alpha, result{idx}, gvl );", idx=idx)

        if not TRMM:
            dest.write()
            dest.write("ci=n_top*ldc+m_top;")
            dest.write()

        for j in range(N):
            for i in range(a_regs):
                idx = j*a_regs+i
                increment = ' ci += ldc-gvl*{};'.format(a_regs-1) if (i == a_regs-1) else ' ci += gvl;'
                if idx == N*a_regs-1:
                    increment = ''
                dest.write("{VSEV}( &C[ci], c{idx}, gvl);{increment}", idx=idx, increment=increment)
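# In generate_gemm_kernel_inner_real above, the '{increment}' suffix strings
# walk ci down one column of C in gvl-element steps (one per A register);
# 'ci += ldc-gvl*(a_regs-1)' then jumps from the bottom of one column to the
# top of the next, and the very last load/store omits the increment entirely.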
#-----------------------------------------------------------------------
def generate_gemm_kernel_inner_complex( settings, dest, M, N, vlen, a_regs ):
    TRMM = (settings['op'].value == 'trmm')
    narrow_result = (settings['param_precision'].value != 'double') and settings['force_acc_double'].value

    if narrow_result:
        raise RuntimeError("wide accumulator not supported for generated complex kernels")
        # we could, but we run out of registers really really fast

    with dest.map(M=M, N=N):
        dest.write("{index_type} ai=m_top*K*2;")
        dest.write("{index_type} bi=n_top*K*2;")
        if TRMM:
            generate_trmm_block( dest )

        for i in range(N):
            dest.write("{param_scalar_t} B{i}r = B[bi+{i}*2+0];", i=i)
            dest.write("{param_scalar_t} B{i}i = B[bi+{i}*2+1];", i=i)
        dest.write("bi += {N}*2;")
        dest.write()

        for i in range(a_regs):
            dest.write("{param_vector_t} A{i}r = {VLSEV}( &A[ai+{i}*gvl*2], sizeof(FLOAT)*2, gvl );", i=i)
            dest.write("{param_vector_t} A{i}i = {VLSEV}( &A[ai+{i}*gvl*2+1], sizeof(FLOAT)*2, gvl );", i=i)
        dest.write("ai += {M}*2;")
        dest.write()

        accumulation_regs = a_regs * N * settings['LMUL_ACC'].value
        dest.write("// {a_regs} vector regs to hold A array contents, {accumulation_regs} regs to hold values accumulated over k",
            a_regs=a_regs*2, accumulation_regs=accumulation_regs*2
        )
        pass_regs = (accumulation_regs + a_regs)*2
        tmp_regs = 32-pass_regs
        if tmp_regs < 2:
            raise RuntimeError("Complex kernel would use too many registers!")

        dest.write("// leaving {tmp_regs} vector registers for temporaries", tmp_regs=tmp_regs)

        tmp_unroll_i = min(tmp_regs, a_regs)
        tmp_unroll_j = N
        while tmp_unroll_j > 1 and (tmp_regs/(tmp_unroll_i*2)) < tmp_unroll_j:
            tmp_unroll_j = int(tmp_unroll_j / 2)

        if tmp_unroll_i < a_regs or tmp_unroll_j < N:
            dest.write("// performing {ops} operations between reuses of temporaries", ops=tmp_unroll_j*tmp_unroll_i)

        for tj in range(0, N, tmp_unroll_j):
            for ti in range(0, a_regs, tmp_unroll_i):
                for j in range(tj, tj+tmp_unroll_j):
                    for i in range(ti, ti+tmp_unroll_i):
                        with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j):
                            if ti == 0 and tj==0:
                                dest.write("{acc_vector_t} tmp{tmp}r = {VMUL_TO_ACC}( A{i}i, B{j}i, gvl);")
                                dest.write("{acc_vector_t} tmp{tmp}i = {VMUL_TO_ACC}( A{i}r, B{j}i, gvl);")
                            else:
                                dest.write("tmp{tmp}r = {VMUL_TO_ACC}( A{i}i, B{j}i, gvl);")
                                dest.write("tmp{tmp}i = {VMUL_TO_ACC}( A{i}r, B{j}i, gvl);")
                for j in range(tj, tj+tmp_unroll_j):
                    for i in range(ti, ti+tmp_unroll_i):
                        with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j):
                            dest.write("tmp{tmp}r = VFMACC_RR( tmp{tmp}r, B{j}r, A{i}r, gvl);")
                            dest.write("tmp{tmp}i = VFMACC_RI( tmp{tmp}i, B{j}r, A{i}i, gvl);")

                for j in range(tj, tj+tmp_unroll_j):
                    for i in range(ti, ti+tmp_unroll_i):
                        with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j):
                            dest.write("{acc_vector_t} ACC{dest}r = tmp{tmp}r;")
                            dest.write("{acc_vector_t} ACC{dest}i = tmp{tmp}i;")

        with dest.block("for({index_type} k=1; k<{Kend}; k++) {{", "}}", Kend=('pass_K' if TRMM else 'K')):
            for i in range(N):
                dest.write("B{i}r = B[bi+{i}*2+0];", i=i)
                dest.write("B{i}i = B[bi+{i}*2+1];", i=i)
            dest.write("bi += {N}*2;")
            dest.write()

            for i in range(a_regs):
                dest.write("A{i}r = {VLSEV}( &A[ai+{i}*gvl*2], sizeof(FLOAT)*2, gvl );", i=i)
                dest.write("A{i}i = {VLSEV}( &A[ai+{i}*gvl*2+1], sizeof(FLOAT)*2, gvl );", i=i)

            dest.write("ai += {M}*2;")
            dest.write()

            for tj in range(0, N, tmp_unroll_j):
                for ti in range(0, a_regs, tmp_unroll_i):
                    # note the values in tmp{tmp}* are frequently of similar magnitude and opposite sign
                    # so accumulating them directly to ACC would lose precision when ACC is larger

                    for j in range(tj, tj+tmp_unroll_j):
                        for i in range(ti, ti+tmp_unroll_i):
                            with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j):
                                dest.write("tmp{tmp}r = {VMUL_TO_ACC}( A{i}i, B{j}i, gvl);")
                                dest.write("tmp{tmp}i = {VMUL_TO_ACC}( A{i}r, B{j}i, gvl);")
                    for j in range(tj, tj+tmp_unroll_j):
                        for i in range(ti, ti+tmp_unroll_i):
                            with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j):
                                dest.write("tmp{tmp}r = VFMACC_RR( tmp{tmp}r, B{j}r, A{i}r, gvl);")
                                dest.write("tmp{tmp}i = VFMACC_RI( tmp{tmp}i, B{j}r, A{i}i, gvl);")
                    for j in range(tj, tj+tmp_unroll_j):
                        for i in range(ti, ti+tmp_unroll_i):
                            with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j):
                                dest.write("ACC{dest}r = vfadd( ACC{dest}r, tmp{tmp}r, gvl);")
                                dest.write("ACC{dest}i = vfadd( ACC{dest}i, tmp{tmp}i, gvl);")

        dest.write()
        dest.write("{index_type} ci=n_top*ldc+m_top;")
        dest.write()

        for j in range(N):
            if TRMM:
                for i in range(a_regs):
                    with dest.map(idx=j*a_regs+i):
                        dest.write("{param_vector_t} C{idx}r = vfmul( ACC{idx}r, alphar, gvl );")
                        dest.write("{param_vector_t} C{idx}i = vfmul( ACC{idx}i, alphar, gvl );")
            else:
                for i in range(a_regs):
                    idx = j*a_regs+i
                    increment = 'ci += ldc-gvl*{};'.format(a_regs-1) if (i == a_regs-1) else ' ci += gvl;'
                    if idx == N*a_regs-1:
                        increment = ''
                    with dest.map(idx=j*a_regs+i, increment=increment):
                        dest.write("{param_vector_t} C{idx}r = {VLSEV}( &C[ci*2+0], sizeof(FLOAT)*2, gvl );")
                        dest.write("{param_vector_t} C{idx}i = {VLSEV}( &C[ci*2+1], sizeof(FLOAT)*2, gvl );")
                        dest.write("{increment}")

        if not TRMM:
            for j in range(N):
                for i in range(a_regs):
                    with dest.map(idx=j*a_regs+i):
                        dest.write("C{idx}r = vfmacc( C{idx}r, alphar, ACC{idx}r, gvl );")
                        dest.write("C{idx}i = vfmacc( C{idx}i, alphar, ACC{idx}i, gvl );")

        for j in range(N):
            for i in range(a_regs):
                with dest.map(idx=j*a_regs+i):
                    dest.write("C{idx}r = vfnmsac( C{idx}r, alphai, ACC{idx}i, gvl );")
                    dest.write("C{idx}i = vfmacc ( C{idx}i, alphai, ACC{idx}r, gvl );")

        if not TRMM:
            dest.write()
            dest.write("ci=n_top*ldc+m_top;")
            dest.write()

        for j in range(N):
            for i in range(a_regs):
                idx = j*a_regs+i
                increment = 'ci += ldc-gvl*{};'.format(a_regs-1) if (i == a_regs-1) else ' ci += gvl;'
                if idx == N*a_regs-1:
                    increment = ''
                with dest.map(idx=j*a_regs+i, increment=increment):
                    dest.write("{VSSEV}( &C[ci*2+0], sizeof(FLOAT)*2, C{idx}r, gvl);")
                    dest.write("{VSSEV}( &C[ci*2+1], sizeof(FLOAT)*2, C{idx}i, gvl);")
                    dest.write("{increment}")

#-----------------------------------------------------------------------
def generate_gemm_kernel( settings, OUTPUT ):
    if settings['conjugate'].value:
        ERROR('conjugate gemm not yet supported')

    is_complex = settings['complex'].value
    generate_gemm_kernel_inner = generate_gemm_kernel_inner_complex if is_complex else generate_gemm_kernel_inner_real
    dest = Target(OUTPUT, { k:str(settings[k].value) for k in settings })

    M = settings['M'].value
    N = settings['N'].value
    vlenmax = int( settings['reg_width_bits'].value / settings['ELEN_PARAM'].value )
    a_regs = max(int(M/vlenmax), 1)

    accumulation_regs = a_regs * N * settings['LMUL_ACC'].value
    required_regs = accumulation_regs + a_regs
    if is_complex:
        required_regs = required_regs * 2 + 2
        dest.write('''
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define S0 1
#define S1 -1
#define S2 1
#define S3 1
#define VFMACC_RR vfmsac{tail_policy}
#define VFMACC_RI vfmacc{tail_policy}
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define S0 1
#define S1 1
#define S2 1
#define S3 -1
#define VFMACC_RR vfmacc{tail_policy}
#define VFMACC_RI vfmsac{tail_policy}
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define S0 1
#define S1 1
#define S2 -1
#define S3 1
#define VFMACC_RR vfmacc{tail_policy}
#define VFMACC_RI vfnmsac{tail_policy}
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
#define S0 1
#define S1 -1
#define S2 -1
#define S3 -1
#define VFMACC_RR vfmsac{tail_policy}
#define VFMACC_RI vfnmacc{tail_policy}
#endif
'''.format(tail_policy=settings['tail_policy'].value))

    if required_regs > 32:
        raise Exception("{} vector registers needed during accumulation for unrolling {} x {}{} but only 32 are available".format(
            required_regs, N, M, (" with wide accumulator" if settings['LMUL_ACC'].value > 1 else '')
        ))

    TRMM = (settings['op'].value == 'trmm')
    if TRMM:
        with dest.block("#if defined(LEFT) != defined(TRANSA)", "#endif"):
            dest.write("#define BACKWARDS")

    dest.write("int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, {alpha}, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc{trmm})",
        alpha = ('FLOAT alphar, FLOAT alphai' if is_complex else 'FLOAT alpha'),
        trmm = (', BLASLONG offset' if TRMM else '')
    )

    with dest.block("{{", "}}", elt_size='*2' if is_complex else ''):
        if settings['trace'].value:
            dest.write("printf(\"\\n\\nENTRY: %s(%d) M %d N %d K %d ldc %d\\n\", __FILE__, __LINE__, M, N, K, ldc);")
        dest.write("{index_type} gvl = 0;")
        dest.write("{index_type} m_top = 0;")
        dest.write("{index_type} n_top = 0;")

        dest.write()
        dest.write()
        dest.write("// -- MAIN PASS")

        with dest.block("for ({index_type} j=0; j<N/{N}; j+=1) {{", "}}"):
            dest.write("m_top = 0;")
            dest.write("{index_type} gvl = {VSETVL}({vlenmax});", vlenmax=min(vlenmax,max(int(M/a_regs),1)))
            dest.write()
            with dest.block("for ({index_type} i=0; i<M/{M}; i+=1) {{", "}}"):
                generate_gemm_kernel_inner( settings, dest, M, N, vlenmax, a_regs )
                dest.write( "m_top += {M};" )

            dest.write()
            dest.write()
            dest.write("// -- tails for main pass")
            generate_M_tails( dest, settings, M, N )

            dest.write( "n_top += {N};" )

        N_tail = int(N/2)
        while( N_tail > 0 ):
            with dest.map(N=N_tail):
                dest.write()
                dest.write()
                dest.write("// -- tails for N={N}")
                with dest.block("if( N & {N} ) {{", "}}" ):
                    if settings['trace'].value:
                        dest.write("printf(\"N tail entry: %s(%d) M %d N %d K %d m_top %d n_top %d\\n\", __FILE__, __LINE__, M, N, K, m_top, n_top);")
                    dest.write("gvl = {VSETVL}({vlenmax});", vlenmax=min(vlenmax,max(int(M/a_regs),1)))
                    dest.write("m_top = 0;")
                    with dest.block("for ({index_type} i=0; i<M/{M}; i+=1) {{", "}}"):
                        generate_gemm_kernel_inner( settings, dest, M, N_tail, vlenmax, a_regs )
                        dest.write("m_top += {M};")

                    generate_M_tails( dest, settings, M, N_tail )
                    dest.write("n_top += {N};")
            N_tail = int(N_tail/2)

        dest.write("return 0;")
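# For the default settings (M=16, N=4, reg_width_bits=256, param_precision=float,
# LMUL=1) the register budget works out as: vlenmax = 256/32 = 8 elements,
# a_regs = 16/8 = 2, accumulation_regs = 2*4*1 = 8, so required_regs = 10 of
# the 32 vector registers, well inside the limit checked above.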
#-----------------------------------------------------------------------
def generate_M_tails( dest, settings, M, N ):
    M_tail = int(M/2)
    M_tail_min = settings['M_tail_scalar_from'].value
    vlenmax = int( settings['reg_width_bits'].value / settings['ELEN_PARAM'].value )
    TRMM = (settings['op'].value == 'trmm')
    is_complex = settings['complex'].value
    generate_gemm_kernel_inner = generate_gemm_kernel_inner_complex if is_complex else generate_gemm_kernel_inner_real

    while( M_tail > M_tail_min ):
        with dest.block("if( M & {M_tail} ) {{", "}}", M_tail=M_tail ):
            if settings['trace'].value:
                dest.write("printf(\"tail: %s(%d) M %d N %d K %d m_top %d n_top %d\\n\", __FILE__, __LINE__, M, N, K, m_top, n_top);")
            a_regs = max( 1, int(M_tail/vlenmax) )
            vlen = int(M_tail/a_regs)
            dest.write("gvl = {VSETVL}({vlen});\n", vlen=vlen)

            generate_gemm_kernel_inner( settings, dest, M_tail, N, vlen, a_regs )
            dest.write( "m_top += {M_tail};" )

        M_tail = int( M_tail / 2 )

    while( M_tail > 0 ):
        with dest.block("if( M & {M_tail} ) {{", "}}",
                M_tail=M_tail,
                N=N,
                result_t = ('double' if settings['force_acc_double'].value else settings['param_scalar_t'].value)
            ):
            if settings['trace'].value:
                dest.write("printf(\"tail: %s(%d) M %d N %d K %d m_top %d n_top %d\\n\", __FILE__, __LINE__, M, N, K, m_top, n_top);")
            for r in range(M_tail * N * (2 if is_complex else 1)):
                dest.write("{result_t} result{r} = 0;", r=r)

            dest.write("{index_type} ai=m_top*K{elt_size};")
            dest.write("{index_type} bi=n_top*K{elt_size};")

            if TRMM:
                with dest.map(M=M_tail, N=N):
                    generate_trmm_block( dest )

            with dest.block("for({index_type} k=0; k<{Kend}; k++) {{", "}}", Kend = ('pass_K' if TRMM else 'K') ):
                for ki in range( N ):
                    for kj in range( M_tail ):
                        if is_complex:
                            dest.write("result{dest}+=S0*A[ai+{kj}+0]*B[bi+{ki}+0] + S1*A[ai+{kj}+1]*B[bi+{ki}+1];".format(
                                dest=(ki*M_tail+kj)*2, kj=kj*2, ki=ki*2
                            ))
                            dest.write("result{dest}+=S2*A[ai+{kj}+1]*B[bi+{ki}+0] + S3*A[ai+{kj}+0]*B[bi+{ki}+1];".format(
                                dest=(ki*M_tail+kj)*2+1, kj=kj*2, ki=ki*2
                            ))
                        else:
                            dest.write("result{dest}+=A[ai+{kj}]*B[bi+{ki}];".format(
                                dest=ki*M_tail+kj, kj=kj, ki=ki
                            ))
                dest.write("ai+={M_tail}{elt_size};")
                dest.write("bi+={N}{elt_size};")

            dest.write("{index_type} ci=n_top*ldc+m_top;")
            if is_complex:
                dest.write("{result_t} Cr, Ci;")
            for ki in range( N ):
                for kj in range( M_tail ):
                    if is_complex:
                        if TRMM:
                            dest.write('Cr = result{dest}*alphar;', dest=(ki*M_tail+kj)*2+0)
                            dest.write('Ci = result{dest}*alphar;', dest=(ki*M_tail+kj)*2+1)
                        else:
                            dest.write('Cr = C[(ci+{ki}*ldc+{kj})*2+0];', ki=ki, kj=kj)
                            dest.write('Ci = C[(ci+{ki}*ldc+{kj})*2+1];', ki=ki, kj=kj)
                            dest.write('Cr += result{dest}*alphar;', dest=(ki*M_tail+kj)*2+0)
                            dest.write('Ci += result{dest}*alphar;', dest=(ki*M_tail+kj)*2+1)
                        dest.write('Cr -= result{dest}*alphai;', dest=(ki*M_tail+kj)*2+1)
                        dest.write('Ci += result{dest}*alphai;', dest=(ki*M_tail+kj)*2+0)
                        dest.write("C[(ci+{ki}*ldc+{kj})*2+0] = Cr;", ki=ki, kj=kj )
                        dest.write("C[(ci+{ki}*ldc+{kj})*2+1] = Ci;", ki=ki, kj=kj )
                    else:
                        op = '' if TRMM else '+'
                        dest.write("C[ci+{ki}*ldc+{kj}] {op}= alpha * result{dest};",
                            ki=ki, kj=kj, op=op, dest=ki*M_tail+kj
                        )
            dest.write("m_top+={M_tail};")

        M_tail = int(M_tail/2)


#-----------------------------------------------------------------------
class Setting(object):
    def __init__( self, value, convert = None ):
        self._value = value
        self._convert = convert

    @classmethod
    def ENUM( cls, *values ):
        def closure( values ):
            return lambda value: values[value.lower()]
        return closure( { v.lower():v for v in values } )

    @classmethod
    def BOOL( cls, value ):
        return value.lower().startswith('t') or value == '1'

    @property
    def value( self ):
        return self._value

    @property
    def configurable( self ):
        return self._convert is not None

    @value.setter
    def value( self, value ):
        self._value = self._convert( value )

    def __str__( self ):
        return str(self._value)

#-----------------------------------------------------------------------
def main():
    settings = {
        'op':                 Setting( 'gemm', Setting.ENUM( 'gemm', 'trmm' ) ),
        'M':                  Setting( 16, int ),
        'N':                  Setting( 4, int ),
        'reg_width_bits':     Setting( 256, int ),
        'LMUL':               Setting( 1, int ),
        'M_tail_scalar_from': Setting( 2, int ),
        'cpu':                Setting( 'any', str ),
        'param_precision':    Setting( 'float', Setting.ENUM( 'float', 'double' ) ),
        'force_acc_double':   Setting( False, Setting.BOOL ),
        'complex':            Setting( False, Setting.BOOL ),
        'conjugate':          Setting( False, Setting.BOOL ),
        'index_type':         Setting( 'BLASLONG', str ),
        'trace':              Setting( False, Setting.BOOL ),
        'output':             Setting( None, str ),
        'tail_policy':        Setting( '_ta', str ),
    }

    for item in sys.argv[1:]:
        try:
            name, value = tuple(item.split( '=', 1 ))
        except:
            ERROR("couldn't parse {}, expected arguments of the form name=value".format(item))

        if name not in settings:
            ERROR("couldn't parse {}, {} it is not a known option\n".format( item, name )
                +"options (and current defaults) are\n{}".format(
                    " ".join([ '{}={}'.format(k, settings[k].value) for k in settings.keys()]))
            )

        try:
            settings[name].value = value
        except:
            import traceback
            traceback.print_exc()
            ERROR("couldn't parse {}".format(item))

    if settings['output'].value is None:
        if settings['complex'].value:
            prefix = 'z' if settings['param_precision'].value == 'double' else 'c'
        else:
            prefix = 'd' if settings['param_precision'].value == 'double' else 's'
        settings['output'] = Setting('{}{}_kernel_{}x{}_{}.c'.format(
            prefix,
            settings['op'],
            settings['M'],
            settings['N'],
            settings['cpu']
        ))

    if settings['param_precision'].value == 'double':
        settings['param_scalar_t'] = Setting( 'double' )
        settings['ELEN_PARAM'] = Setting(64)
    else:
        settings['param_scalar_t'] = Setting( 'float' )
        settings['ELEN_PARAM'] = Setting(32)

    settings['VFMUL'] = Setting( 'vfmul_vf_f{}m{}{}'.format(settings['ELEN_PARAM'], settings['LMUL'], settings['tail_policy']) )
    settings['VFMACC'] = Setting( 'vfmacc_vf_f{}m{}{}'.format(settings['ELEN_PARAM'], settings['LMUL'], settings['tail_policy']) )

    settings['ELEN_ACC'] = settings['ELEN_PARAM']
    settings['LMUL_ACC'] = Setting(settings['LMUL'].value)
    widen = ''

    if settings['force_acc_double'].value and (settings['param_precision'].value == 'float'):
        settings['ELEN_ACC'] = Setting(64)
        settings['LMUL_ACC'] = Setting(settings['LMUL'].value*2)
        settings['VFNCVT'] = Setting('vfncvt_f_f_w_f{}m{}{}'.format(settings['ELEN_PARAM'], settings['LMUL'], settings['tail_policy']))
        widen = 'w'

    settings['VMUL_TO_ACC'] = Setting( 'vf{}mul_vf_f{}m{}{}'.format(widen, settings['ELEN_ACC'], settings['LMUL_ACC'], settings['tail_policy']) )
    settings['VMACC_TO_ACC'] = Setting( 'vf{}macc_vf_f{}m{}{}'.format(widen, settings['ELEN_ACC'], settings['LMUL_ACC'], settings['tail_policy']) )

    settings['param_vector_t'] = Setting('vfloat{}m{}_t'.format(settings['ELEN_PARAM'], settings['LMUL']))
    settings['acc_vector_t']   = Setting('vfloat{}m{}_t'.format(settings['ELEN_ACC'], settings['LMUL_ACC']))
    settings['VLEV']           = Setting('vle{}_v_f{}m{}'.format(settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL']))
    settings['VSEV']           = Setting('vse{}_v_f{}m{}'.format(settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL']))
    settings['VLSEV']          = Setting('vlse{}_v_f{}m{}'.format(settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL']))
    settings['VSSEV']          = Setting('vsse{}_v_f{}m{}'.format(settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL']))
    settings['VSETVL']         = Setting('vsetvl_e{}m{}'.format(settings['ELEN_PARAM'], settings['LMUL']))


    to_stdout = (settings['output'].value == '-')
    if not to_stdout:
        print("Writing {}".format(settings['output'].value), file=sys.stderr)

    with open(sys.stdout.fileno() if to_stdout else settings['output'].value, 'w') as destination_file:
        def OUTPUT(*args, **kwargs):
            print(*args, file=destination_file, **kwargs)

        OUTPUT("/*\n\nAUTOGENERATED KERNEL\nSettings:\n {}".format(" ".join([ "{}={}\n".format(k, repr(settings[k].value)) for k in sorted(settings.keys()) if settings[k].configurable])))
        OUTPUT("Derived:\n {}\n*/\n".format(" ".join([ "{}={}\n".format(k, repr(settings[k].value)) for k in sorted(settings.keys()) if not settings[k].configurable])))

        OUTPUT('#include "common.h"')
        OUTPUT("\n")

        if settings['op'].value in ('gemm', 'trmm'):
            generate_gemm_kernel(settings, OUTPUT)
        else:
            ERROR("unsupported kernel type {}".format(settings['op']))

if __name__ == "__main__":
    main()
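# Example invocation (the script file name here is illustrative; the
# name=value keys must match the settings table in main() above):
#
#   ./generate_kernel.py op=gemm M=8 N=4 param_precision=double cpu=c910v
#
# With 'output' left unset this writes dgemm_kernel_8x4_c910v.c per the
# default-name logic above; output=- streams the generated kernel to stdout.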
@ -27,118 +27,113 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "common.h"
#include <math.h>
#include <float.h>

#if defined(DOUBLE)

#define ABS fabs
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLEV_FLOAT vle_v_f64m8
#define VLSEV_FLOAT vlse_v_f64m8
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
#define MASK_T vbool8_t
#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
#define VFMAXVV_FLOAT vfmax_vv_f64m8
#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8
#define VMFIRSTM vmfirst_m_b8
#define VMFIRSTM vfirst_m_b8
#define UINT_V_T vuint64m8_t
#define VIDV_MASK_UINT vid_v_u64m8_m
#define VIDV_UINT vid_v_u64m8
#define VADDVX_MASK_UINT vadd_vx_u64m8_m
#define VADDVX_UINT vadd_vx_u64m8
#define VMVVX_UINT vmv_v_x_u64m8
#define VFABS_FLOAT vfabs_v_f64m8
#define VCOMPRESS vcompress_vm_u64m8
#define VMV_X vmv_x_s_u64m8_u64
#else

#define ABS fabsf
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLEV_FLOAT vle_v_f32m8
#define VLSEV_FLOAT vlse_v_f32m8
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
#define MASK_T vbool4_t
#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
#define VFMAXVV_FLOAT vfmax_vv_f32m8
#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4
#define VMFIRSTM vmfirst_m_b4
#define VMFIRSTM vfirst_m_b4
#define UINT_V_T vuint32m8_t
#define VIDV_MASK_UINT vid_v_u32m8_m
#define VIDV_UINT vid_v_u32m8
#define VADDVX_MASK_UINT vadd_vx_u32m8_m
#define VADDVX_UINT vadd_vx_u32m8
#define VMVVX_UINT vmv_v_x_u32m8
#define VFABS_FLOAT vfabs_v_f32m8
#define VCOMPRESS vcompress_vm_u32m8
#define VMV_X vmv_x_s_u32m8_u32
#endif


BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG i=0, j=0;
    FLOAT maxf=0.0;
    BLASLONG i=0, j=0;
    unsigned int max_index = 0;
    if (n <= 0 || inc_x <= 0) return(max_index);
    if (n <= 0 || inc_x <= 0) return(max_index);
    FLOAT maxf=-FLT_MAX;

    FLOAT_V_T vx, v_max;
    UINT_V_T v_max_index;
    MASK_T mask;
    unsigned int gvl = 0;
    FLOAT_V_T_M1 v_res, v_z0;
    gvl = VSETVL_MAX;
    v_res = VFMVVF_FLOAT_M1(0, gvl);
    v_z0 = VFMVVF_FLOAT_M1(0, gvl);
    FLOAT_V_T_M1 v_res;
    v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1);

    if(inc_x == 1){
        gvl = VSETVL(n);
        v_max_index = VMVVX_UINT(0, gvl);
        v_max = VFMVVF_FLOAT(-1, gvl);
        v_max = VFMVVF_FLOAT(-FLT_MAX, gvl);
        for(i=0,j=0; i < n/gvl; i++){
            vx = VLEV_FLOAT(&x[j], gvl);
            //fabs(vector)
            mask = VMFLTVF_FLOAT(vx, 0, gvl);
            vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);
            vx = VFABS_FLOAT(vx, gvl);

            //index where element greater than v_max
            mask = VMFLTVV_FLOAT(v_max, vx, gvl);
            v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl);
            v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl);
            v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl);

            //update v_max and start_index j
            v_max = VFMAXVV_FLOAT(v_max, vx, gvl);
            j += gvl;
        }
        v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl);
        maxf = *((FLOAT*)&v_res);
        v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_res, gvl);
        maxf = EXTRACT_FLOAT(v_res);
        mask = VMFGEVF_FLOAT(v_max, maxf, gvl);
        max_index = VMFIRSTM(mask,gvl);
        max_index = *((unsigned int*)&v_max_index+max_index);
        UINT_V_T compressed;
        compressed = VCOMPRESS(mask, compressed, v_max_index, gvl);
        max_index = VMV_X(compressed);

        if(j < n){
            gvl = VSETVL(n-j);
            vx = VLEV_FLOAT(&x[j], gvl);
            //fabs(vector)
            mask = VMFLTVF_FLOAT(vx, 0, gvl);
            v_max = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);
            v_max = VLEV_FLOAT(&x[j], gvl);
            v_max = VFABS_FLOAT(v_max, gvl);

            v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl);
            FLOAT cur_maxf = *((FLOAT*)&v_res);
            v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_res, gvl);
            FLOAT cur_maxf = EXTRACT_FLOAT(v_res);
            if(cur_maxf > maxf){
                //tail index
                v_max_index = VIDV_UINT(gvl);
                v_max_index = VADDVX_UINT(v_max_index, j, gvl);

                mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
                max_index = VMFIRSTM(mask,gvl);
                max_index = *((unsigned int*)&v_max_index+max_index);
                UINT_V_T compressed;
                compressed = VCOMPRESS(mask, compressed, v_max_index, gvl);
                max_index = VMV_X(compressed);
            }
        }
    }else{

@ -146,13 +141,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
        unsigned int stride_x = inc_x * sizeof(FLOAT);
        unsigned int idx = 0, inc_v = gvl * inc_x;

        v_max = VFMVVF_FLOAT(-FLT_MAX, gvl);
        v_max_index = VMVVX_UINT(0, gvl);
        v_max = VFMVVF_FLOAT(-1, gvl);
        for(i=0,j=0; i < n/gvl; i++){
            vx = VLSEV_FLOAT(&x[idx], stride_x, gvl);
            //fabs(vector)
            mask = VMFLTVF_FLOAT(vx, 0, gvl);
            vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);
            vx = VFABS_FLOAT(vx, gvl);

            //index where element greater than v_max
            mask = VMFLTVV_FLOAT(v_max, vx, gvl);

@ -164,33 +157,34 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
            j += gvl;
            idx += inc_v;
        }
        v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl);
        maxf = *((FLOAT*)&v_res);

        v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_res, gvl);
        maxf = EXTRACT_FLOAT(v_res);
        mask = VMFGEVF_FLOAT(v_max, maxf, gvl);
        max_index = VMFIRSTM(mask,gvl);
        max_index = *((unsigned int*)&v_max_index+max_index);
        UINT_V_T compressed;
        compressed = VCOMPRESS(mask, compressed, v_max_index, gvl);
        max_index = VMV_X(compressed);

        if(j < n){
            gvl = VSETVL(n-j);
            vx = VLSEV_FLOAT(&x[idx], stride_x, gvl);
            //fabs(vector)
            mask = VMFLTVF_FLOAT(vx, 0, gvl);
            v_max = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);
            v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl);
            v_max = VFABS_FLOAT(v_max, gvl);

            v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_res, gvl);
            FLOAT cur_maxf = EXTRACT_FLOAT(v_res);

            v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl);
            FLOAT cur_maxf = *((FLOAT*)&v_res);
            if(cur_maxf > maxf){
                //tail index
                v_max_index = VIDV_UINT(gvl);
                v_max_index = VADDVX_UINT(v_max_index, j, gvl);

                mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
                max_index = VMFIRSTM(mask,gvl);
                max_index = *((unsigned int*)&v_max_index+max_index);

                UINT_V_T compressed;
                compressed = VCOMPRESS(mask, compressed, v_max_index, gvl);
                max_index = VMV_X(compressed);
            }
        }
    }
    return(max_index+1);
    return(max_index+1);
}
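/* The VCOMPRESS + VMV_X sequence above replaces the old
   '*((unsigned int*)&v_max_index + max_index)' lane extraction: RVV 1.0
   intrinsic vector types are sizeless, so their address can no longer be
   taken and reinterpreted, while vcompress followed by vmv.x.s pulls the
   selected index out entirely in registers. */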
@ -31,85 +31,79 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#if defined(DOUBLE)

#define ABS fabs
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLEV_FLOAT vle_v_f64m8
#define VLSEV_FLOAT vlse_v_f64m8
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
#define MASK_T vbool8_t
#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
#define VMFGTVV_FLOAT vmfgt_vv_f64m8_b8
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
#define VFMINVV_FLOAT vfmin_vv_f64m8
#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8
#define VMFIRSTM vmfirst_m_b8
#define VMFIRSTM vfirst_m_b8
#define UINT_V_T vuint64m8_t
#define VIDV_MASK_UINT vid_v_u64m8_m
#define VIDV_UINT vid_v_u64m8
#define VADDVX_MASK_UINT vadd_vx_u64m8_m
#define VADDVX_UINT vadd_vx_u64m8
#define VMVVX_UINT vmv_v_x_u64m8
#define VFABS_FLOAT vfabs_v_f64m8
#define VCOMPRESS vcompress_vm_u64m8
#define VMV_X vmv_x_s_u64m8_u64
#else

#define ABS fabsf
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLEV_FLOAT vle_v_f32m8
#define VLSEV_FLOAT vlse_v_f32m8
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
#define MASK_T vbool4_t
#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
#define VMFGTVV_FLOAT vmfgt_vv_f32m8_b4
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
#define VFMINVV_FLOAT vfmin_vv_f32m8
#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4
#define VMFIRSTM vmfirst_m_b4
#define VMFIRSTM vfirst_m_b4
#define UINT_V_T vuint32m8_t
#define VIDV_MASK_UINT vid_v_u32m8_m
#define VIDV_UINT vid_v_u32m8
#define VADDVX_MASK_UINT vadd_vx_u32m8_m
#define VADDVX_UINT vadd_vx_u32m8
#define VMVVX_UINT vmv_v_x_u32m8
#define VFABS_FLOAT vfabs_v_f32m8
#define VCOMPRESS vcompress_vm_u32m8
#define VMV_X vmv_x_s_u32m8_u32
#endif


BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG i=0, j=0;
    FLOAT minf=FLT_MAX;
    BLASLONG i=0, j=0;
    unsigned int min_index = 0;
    if (n <= 0 || inc_x <= 0) return(min_index);
    if (n <= 0 || inc_x <= 0) return(min_index);
    FLOAT minf=FLT_MAX;

    FLOAT_V_T vx, v_min;
    UINT_V_T v_min_index;
    MASK_T mask;
    unsigned int gvl = 0;
    FLOAT_V_T_M1 v_res, v_max;
    gvl = VSETVL_MAX;
    v_res = VFMVVF_FLOAT_M1(0, gvl);
    v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl);
    FLOAT_V_T_M1 v_res;
    v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1);

    if(inc_x == 1){
        gvl = VSETVL(n);
        v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
        v_min_index = VMVVX_UINT(0, gvl);
        v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
        for(i=0,j=0; i < n/gvl; i++){
            vx = VLEV_FLOAT(&x[j], gvl);
            //fabs(vector)
            mask = VMFLTVF_FLOAT(vx, 0, gvl);
            vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);
            vx = VFABS_FLOAT(vx, gvl);

            //index where element less than v_min
            mask = VMFLTVV_FLOAT(vx, v_min, gvl);
            //index where element greater than v_min
            mask = VMFGTVV_FLOAT(v_min, vx, gvl);
            v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl);
            v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl);


@ -117,29 +111,29 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
            v_min = VFMINVV_FLOAT(v_min, vx, gvl);
            j += gvl;
        }
        v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
        minf = *((FLOAT*)&v_res);
        v_res = VFREDMINVS_FLOAT(v_res, v_min, v_res, gvl);
        minf = EXTRACT_FLOAT(v_res);
        mask = VMFLEVF_FLOAT(v_min, minf, gvl);
        min_index = VMFIRSTM(mask,gvl);
        min_index = *((unsigned int*)&v_min_index+min_index);
        UINT_V_T compressed;
        compressed = VCOMPRESS(mask, compressed, v_min_index, gvl);
        min_index = VMV_X(compressed);

        if(j < n){
            gvl = VSETVL(n-j);
            vx = VLEV_FLOAT(&x[j], gvl);
            //fabs(vector)
            mask = VMFLTVF_FLOAT(vx, 0, gvl);
            v_min = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);
            v_min = VLEV_FLOAT(&x[j], gvl);
            v_min = VFABS_FLOAT(v_min, gvl);

            v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
            FLOAT cur_minf = *((FLOAT*)&v_res);
            if(cur_minf < minf){
            v_res = VFREDMINVS_FLOAT(v_res, v_min, v_res, gvl);
            FLOAT cur_minf = EXTRACT_FLOAT(v_res);
            if(cur_minf > minf){
                //tail index
                v_min_index = VIDV_UINT(gvl);
                v_min_index = VADDVX_UINT(v_min_index, j, gvl);

                mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl);
                min_index = VMFIRSTM(mask,gvl);
                min_index = *((unsigned int*)&v_min_index+min_index);
                UINT_V_T compressed;
                compressed = VCOMPRESS(mask, compressed, v_min_index, gvl);
                min_index = VMV_X(compressed);
            }
        }
    }else{

@ -151,12 +145,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
        v_min_index = VMVVX_UINT(0, gvl);
        for(i=0,j=0; i < n/gvl; i++){
            vx = VLSEV_FLOAT(&x[idx], stride_x, gvl);
            //fabs(vector)
            mask = VMFLTVF_FLOAT(vx, 0, gvl);
            vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);
            vx = VFABS_FLOAT(vx, gvl);

            //index where element less than v_min
            mask = VMFLTVV_FLOAT(vx, v_min, gvl);
            //index where element greater than v_min
            mask = VMFGTVV_FLOAT(v_min, vx, gvl);
            v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl);
            v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl);


@ -165,33 +157,31 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
            j += gvl;
            idx += inc_v;
        }
        v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
        minf = *((FLOAT*)&v_res);
        v_res = VFREDMINVS_FLOAT(v_res, v_min, v_res, gvl);
        minf = EXTRACT_FLOAT(v_res);
        mask = VMFLEVF_FLOAT(v_min, minf, gvl);
        min_index = VMFIRSTM(mask,gvl);
        min_index = *((unsigned int*)&v_min_index+min_index);
        UINT_V_T compressed;
        compressed = VCOMPRESS(mask, compressed, v_min_index, gvl);
        min_index = VMV_X(compressed);

        if(j < n){
            gvl = VSETVL(n-j);
            vx = VLSEV_FLOAT(&x[idx], stride_x, gvl);
            //fabs(vector)
            mask = VMFLTVF_FLOAT(vx, 0, gvl);
            v_min = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);
            v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl);
            v_min = VFABS_FLOAT(v_min, gvl);

            v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
            FLOAT cur_minf = *((FLOAT*)&v_res);
            if(cur_minf < minf){
            v_res = VFREDMINVS_FLOAT(v_res, v_min, v_res, gvl);
            FLOAT cur_minf = EXTRACT_FLOAT(v_res);
            if(cur_minf > minf){
                //tail index
                v_min_index = VIDV_UINT(gvl);
                v_min_index = VADDVX_UINT(v_min_index, j, gvl);

                mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl);
                min_index = VMFIRSTM(mask,gvl);
                min_index = *((unsigned int*)&v_min_index+min_index);
                UINT_V_T compressed;
                compressed = VCOMPRESS(mask, compressed, v_min_index, gvl);
                min_index = VMV_X(compressed);
            }
        }
    }
    return(min_index+1);
    return(min_index+1);
}
@ -31,13 +31,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#if defined(DOUBLE)

#define ABS fabs
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLEV_FLOAT vle_v_f64m8
#define VLSEV_FLOAT vlse_v_f64m8
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
#define MASK_T vbool8_t
#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8

@ -45,22 +43,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFMAXVV_FLOAT vfmax_vv_f64m8
#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8
#define VMFIRSTM vmfirst_m_b8
#define VMFIRSTM vfirst_m_b8
#define UINT_V_T vuint64m8_t
#define VIDV_MASK_UINT vid_v_u64m8_m
#define VIDV_UINT vid_v_u64m8
#define VADDVX_MASK_UINT vadd_vx_u64m8_m
#define VADDVX_UINT vadd_vx_u64m8
#define VMVVX_UINT vmv_v_x_u64m8
#define VCOMPRESS vcompress_vm_u64m8
#define VMV_X vmv_x_s_u64m8_u64
#else

#define ABS fabsf
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLEV_FLOAT vle_v_f32m8
#define VLSEV_FLOAT vlse_v_f32m8
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
#define MASK_T vbool4_t
#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4

@ -68,31 +66,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFMAXVV_FLOAT vfmax_vv_f32m8
#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4
#define VMFIRSTM vmfirst_m_b4
#define VMFIRSTM vfirst_m_b4
#define UINT_V_T vuint32m8_t
#define VIDV_MASK_UINT vid_v_u32m8_m
#define VIDV_UINT vid_v_u32m8
#define VADDVX_MASK_UINT vadd_vx_u32m8_m
#define VADDVX_UINT vadd_vx_u32m8
#define VMVVX_UINT vmv_v_x_u32m8
#define VCOMPRESS vcompress_vm_u32m8
#define VMV_X vmv_x_s_u32m8_u32
#endif

BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i=0, j=0;
BLASLONG i=0, j=0;
unsigned int max_index = 0;
if (n <= 0 || inc_x <= 0) return(max_index);
FLOAT maxf=-FLT_MAX;
if (n <= 0 || inc_x <= 0) return(max_index);
FLOAT maxf=-FLT_MAX;

FLOAT_V_T vx, v_max;
UINT_V_T v_max_index;
MASK_T mask;
unsigned int gvl = 0;
FLOAT_V_T_M1 v_res, v_min;
gvl = VSETVL_MAX;
v_res = VFMVVF_FLOAT_M1(0, gvl);
v_min = VFMVVF_FLOAT_M1(-FLT_MAX, gvl);
FLOAT_V_T_M1 v_res;
v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1);

if(inc_x == 1){
gvl = VSETVL(n);

@ -104,32 +102,34 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
//index where element greater than v_max
mask = VMFLTVV_FLOAT(v_max, vx, gvl);
v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl);
v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl);
v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl);

//update v_max and start_index j
v_max = VFMAXVV_FLOAT(v_max, vx, gvl);
j += gvl;
}
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl);
maxf = *((FLOAT*)&v_res);
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_res, gvl);
maxf = EXTRACT_FLOAT(v_res);
mask = VMFGEVF_FLOAT(v_max, maxf, gvl);
max_index = VMFIRSTM(mask,gvl);
max_index = *((unsigned int*)&v_max_index+max_index);
UINT_V_T compressed;
compressed = VCOMPRESS(mask, compressed, v_max_index, gvl);
max_index = VMV_X(compressed);

if(j < n){
gvl = VSETVL(n-j);
v_max = VLEV_FLOAT(&x[j], gvl);

v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl);
FLOAT cur_maxf = *((FLOAT*)&v_res);
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_res, gvl);
FLOAT cur_maxf = EXTRACT_FLOAT(v_res);
if(cur_maxf > maxf){
//tail index
v_max_index = VIDV_UINT(gvl);
v_max_index = VADDVX_UINT(v_max_index, j, gvl);

mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
max_index = VMFIRSTM(mask,gvl);
max_index = *((unsigned int*)&v_max_index+max_index);
UINT_V_T compressed;
compressed = VCOMPRESS(mask, compressed, v_max_index, gvl);
max_index = VMV_X(compressed);
}
}
}else{

@ -145,37 +145,37 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
//index where element greater than v_max
mask = VMFLTVV_FLOAT(v_max, vx, gvl);
v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl);
v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl);
v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl);

//update v_max and start_index j
v_max = VFMAXVV_FLOAT(v_max, vx, gvl);
j += gvl;
idx += inc_v;
}
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl);
maxf = *((FLOAT*)&v_res);
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_res, gvl);
maxf = EXTRACT_FLOAT(v_res);
mask = VMFGEVF_FLOAT(v_max, maxf, gvl);
max_index = VMFIRSTM(mask,gvl);
max_index = *((unsigned int*)&v_max_index+max_index);
UINT_V_T compressed;
compressed = VCOMPRESS(mask, compressed, v_max_index, gvl);
max_index = VMV_X(compressed);

if(j < n){
gvl = VSETVL(n-j);
v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl);

v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl);
FLOAT cur_maxf = *((FLOAT*)&v_res);
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_res, gvl);
FLOAT cur_maxf = EXTRACT_FLOAT(v_res);
if(cur_maxf > maxf){
//tail index
v_max_index = VIDV_UINT(gvl);
v_max_index = VADDVX_UINT(v_max_index, j, gvl);

mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
max_index = VMFIRSTM(mask,gvl);
max_index = *((unsigned int*)&v_max_index+max_index);
UINT_V_T compressed;
compressed = VCOMPRESS(mask, compressed, v_max_index, gvl);
max_index = VMV_X(compressed);
}
}
}
return(max_index+1);
return(max_index+1);
}

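The other recurring change in these kernels is how the scalar extremum is carried. Previously each vfredmax wrote into a scratch m1 register seeded from a separate identity vector (v_min here), and the scalar was read back through `*((FLOAT*)&v_res)`, another aliasing cast that current toolchains reject. Now v_res is initialised once to the reduction identity and fed back in as its own accumulator, so the running maximum survives the main loop, the tail, and the strided path alike, and EXTRACT_FLOAT (a vfmv_f_s wrapper) reads it out portably. A sketch, assuming the macro definitions above:

    FLOAT_V_T_M1 v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1);  // identity for a max-reduction
    // each call folds a new vector into the running result
    v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_res, gvl);
    FLOAT maxf = EXTRACT_FLOAT(v_res);                  // no pointer casts on vector types
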
@ -31,122 +31,105 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#if defined(DOUBLE)

#define ABS fabs
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLEV_FLOAT vle_v_f64m8
#define VLSEV_FLOAT vlse_v_f64m8
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
#define MASK_T vbool8_t
#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
#define VMFGTVV_FLOAT vmfgt_vv_f64m8_b8
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFMINVV_FLOAT vfmin_vv_f64m8
#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8
#define VMFIRSTM vmfirst_m_b8
#define VMFIRSTM vfirst_m_b8
#define UINT_V_T vuint64m8_t
#define VIDV_MASK_UINT vid_v_u64m8_m
#define VIDV_UINT vid_v_u64m8
#define VADDVX_MASK_UINT vadd_vx_u64m8_m
#define VADDVX_UINT vadd_vx_u64m8
#define VMVVX_UINT vmv_v_x_u64m8
#define VCOMPRESS vcompress_vm_u64m8
#define VMV_X vmv_x_s_u64m8_u64
#else

#define ABS fabsf
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLEV_FLOAT vle_v_f32m8
#define VLSEV_FLOAT vlse_v_f32m8
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
#define MASK_T vbool4_t
#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
#define VMFGTVV_FLOAT vmfgt_vv_f32m8_b4
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFMINVV_FLOAT vfmin_vv_f32m8
#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4
#define VMFIRSTM vmfirst_m_b4
#define VMFIRSTM vfirst_m_b4
#define UINT_V_T vuint32m8_t
#define VIDV_MASK_UINT vid_v_u32m8_m
#define VIDV_UINT vid_v_u32m8
#define VADDVX_MASK_UINT vadd_vx_u32m8_m
#define VADDVX_UINT vadd_vx_u32m8
#define VMVVX_UINT vmv_v_x_u32m8
#define VCOMPRESS vcompress_vm_u32m8
#define VMV_X vmv_x_s_u32m8_u32
#endif

BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i=0, j=0;
FLOAT minf=FLT_MAX;
BLASLONG i=0, j=0;
unsigned int min_index = 0;
if (n <= 0 || inc_x <= 0) return(min_index);
if (n <= 0 || inc_x <= 0) return(min_index);
FLOAT minf=FLT_MAX;

FLOAT_V_T vx, v_min;
UINT_V_T v_min_index;
MASK_T mask;
unsigned int gvl = 0;
FLOAT_V_T_M1 v_res, v_max;
gvl = VSETVL_MAX;
v_res = VFMVVF_FLOAT_M1(0, gvl);
v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl);
FLOAT_V_T_M1 v_res;
v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1);

if(inc_x == 1){
gvl = VSETVL(n);
v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
v_min_index = VMVVX_UINT(0, gvl);
v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
for(i=0,j=0; i < n/gvl; i++){
vx = VLEV_FLOAT(&x[j], gvl);
//index where element less than v_min
mask = VMFLTVV_FLOAT(vx, v_min, gvl);

//index where element greater than v_min
mask = VMFGTVV_FLOAT(v_min, vx, gvl);
v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl);
/*
#if defined(DOUBLE)
asm volatile(
"vor.vv v0, %1, %1 \n\t"
"vsetvli x0, %2, e64,m8 \n\t"
"vid.v %0, v0.t \n\t"
:"+v"(v_min_index)
:"v"(mask), "r"(gvl)
:"v0");
#else
asm volatile(
"vor.vv v0, %1, %1 \n\t"
"vsetvli x0, %2, e32,m8 \n\t"
"vid.v %0, v0.t \n\t"
:"+v"(v_min_index)
:"v"(mask), "r"(gvl)
:"v0");
#endif
*/
v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j,gvl);
v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl);

//update v_min and start_index j
v_min = VFMINVV_FLOAT(v_min, vx, gvl);
j += gvl;
}
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
minf = *((FLOAT*)&v_res);
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_res, gvl);
minf = EXTRACT_FLOAT(v_res);
mask = VMFLEVF_FLOAT(v_min, minf, gvl);
min_index = VMFIRSTM(mask,gvl);
min_index = *((unsigned int*)&v_min_index+min_index);
UINT_V_T compressed;
compressed = VCOMPRESS(mask, compressed, v_min_index, gvl);
min_index = VMV_X(compressed);

if(j < n){
gvl = VSETVL(n-j);
v_min = VLEV_FLOAT(&x[j], gvl);

v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
FLOAT cur_minf = *((FLOAT*)&v_res);
if(cur_minf < minf){
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_res, gvl);
FLOAT cur_minf = EXTRACT_FLOAT(v_res);
if(cur_minf > minf){
//tail index
v_min_index = VIDV_UINT(gvl);
v_min_index = VADDVX_UINT(v_min_index, j, gvl);

mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl);
min_index = VMFIRSTM(mask,gvl);
min_index = *((unsigned int*)&v_min_index+min_index);
UINT_V_T compressed;
compressed = VCOMPRESS(mask, compressed, v_min_index, gvl);
min_index = VMV_X(compressed);
}
}
}else{

@ -159,59 +142,39 @@ asm volatile(
for(i=0,j=0; i < n/gvl; i++){
vx = VLSEV_FLOAT(&x[idx], stride_x, gvl);

//index where element less than v_min
mask = VMFLTVV_FLOAT(vx, v_min, gvl);
//index where element greater than v_min
mask = VMFGTVV_FLOAT(v_min, vx, gvl);
v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl);
/*
#if defined(DOUBLE)
asm volatile(
"vor.vv v0, %1, %1 \n\t"
"vsetvli x0, %2, e64,m8 \n\t"
"vid.v %0, v0.t \n\t"
:"+v"(v_min_index)
:"v"(mask), "r"(gvl)
:"v0");
#else
asm volatile(
"vor.vv v0, %1, %1 \n\t"
"vsetvli x0, %2, e32,m8 \n\t"
"vid.v %0, v0.t \n\t"
:"+v"(v_min_index)
:"v"(mask), "r"(gvl)
:"v0");
#endif
*/

v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j,gvl);
v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl);

//update v_min and start_index j
v_min = VFMINVV_FLOAT(v_min, vx, gvl);
j += gvl;
idx += inc_v;
}
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
minf = *((FLOAT*)&v_res);
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_res, gvl);
minf = EXTRACT_FLOAT(v_res);
mask = VMFLEVF_FLOAT(v_min, minf, gvl);
min_index = VMFIRSTM(mask,gvl);
min_index = *((unsigned int*)&v_min_index+min_index);
UINT_V_T compressed;
compressed = VCOMPRESS(mask, compressed, v_min_index, gvl);
min_index = VMV_X(compressed);

if(j < n){
gvl = VSETVL(n-j);
v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl);

v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
FLOAT cur_minf = *((FLOAT*)&v_res);
if(cur_minf < minf){
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_res, gvl);
FLOAT cur_minf = EXTRACT_FLOAT(v_res);
if(cur_minf > minf){
//tail index
v_min_index = VIDV_UINT(gvl);
v_min_index = VADDVX_UINT(v_min_index, j, gvl);

mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl);
min_index = VMFIRSTM(mask,gvl);
min_index = *((unsigned int*)&v_min_index+min_index);
UINT_V_T compressed;
compressed = VCOMPRESS(mask, compressed, v_min_index, gvl);
min_index = VMV_X(compressed);
}
}
}
return(min_index+1);
return(min_index+1);
}

@ -27,25 +27,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "common.h"
#include <math.h>
#include <float.h>

#if defined(DOUBLE)

#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
#define VLSEV_FLOAT vlse_v_f64m8
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
#define MASK_T vbool8_t
#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
#define VFMAXVV_FLOAT vfmax_vv_f64m8
#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8
#define VMFIRSTM vmfirst_m_b8
#define VMFIRSTM vfirst_m_b8
#define UINT_V_T vuint64m8_t
#define VSEVU_UINT vse64_v_u64m8
#define UINT_T long unsigned int

@ -53,27 +51,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define VIDV_UINT vid_v_u64m8
#define VADDVX_MASK_UINT vadd_vx_u64m8_m
#define VADDVX_UINT vadd_vx_u64m8
#define VFADDVV_FLOAT vfadd_vv_f64m8
#define VMVVX_UINT vmv_v_x_u64m8
#define VFABS_FLOAT vfabs_v_f64m8
#define VFADDVV_FLOAT vfadd_vv_f64m8
#define VCOMPRESS vcompress_vm_u64m8
#define VMV_X vmv_x_s_u64m8_u64
#else

#define ABS fabsf
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
#define VLSEV_FLOAT vlse_v_f32m8
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
#define MASK_T vbool4_t
#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
#define VFMAXVV_FLOAT vfmax_vv_f32m8
#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4
#define VMFIRSTM vmfirst_m_b4
#define VMFIRSTM vfirst_m_b4
#define UINT_V_T vuint32m8_t
#define UINT_T unsigned int
#define VSEVU_UINT vse32_v_u32m8

@ -81,187 +78,81 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define VIDV_UINT vid_v_u32m8
#define VADDVX_MASK_UINT vadd_vx_u32m8_m
#define VADDVX_UINT vadd_vx_u32m8
#define VFADDVV_FLOAT vfadd_vv_f32m8
#define VMVVX_UINT vmv_v_x_u32m8
#define VFABS_FLOAT vfabs_v_f32m8
#define VFADDVV_FLOAT vfadd_vv_f32m8
#define VCOMPRESS vcompress_vm_u32m8
#define VMV_X vmv_x_s_u32m8_u32
#endif

#define RVV_M RVV_M8

BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i=0, j=0;
FLOAT maxf=0.0;
BLASLONG i=0, j=0;
unsigned int max_index = 0;
if (n <= 0 || inc_x <= 0) return(max_index);
if (n <= 0 || inc_x <= 0) return(max_index);
FLOAT maxf=-FLT_MAX;

FLOAT_V_T vx0, vx1, v_max;
FLOAT_V_T vx, vx2, v_max;
UINT_V_T v_max_index;
MASK_T mask0, mask1;
MASK_T mask;
unsigned int gvl = 0;
FLOAT_V_T_M1 v_res, v_z0;
gvl = VSETVL_MAX;
v_res = VFMVVF_FLOAT_M1(0, gvl);
v_z0 = VFMVVF_FLOAT_M1(0, gvl);
FLOAT_V_T_M1 v_res;
v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1);

gvl = VSETVL(n);
UINT_T temp_uint[gvl];
unsigned int stride_x = inc_x * 2 * sizeof(FLOAT);
unsigned int idx = 0, inc_v = gvl * inc_x * 2;

v_max = VFMVVF_FLOAT(-FLT_MAX, gvl);
v_max_index = VMVVX_UINT(0, gvl);
v_max = VFMVVF_FLOAT(-1, gvl);
BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
BLASLONG inc_xv = gvl * inc_x * 2;
BLASLONG ix = 0;
for(i=0,j=0; i < n/gvl; i++){
vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
//fabs(vector)
mask0 = VMFLTVF_FLOAT(vx0, 0, gvl);
vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl);
/*
#if defined(DOUBLE)
asm volatile(
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+v"(vx0)
:"v"(mask0), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+v"(vx0)
:"v"(mask0), "f"(zero), "r"(gvl)
:"v0");
#endif
*/
vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
//fabs(vector)
mask1 = VMFLTVF_FLOAT(vx1, 0, gvl);
vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl);
/*
#if defined(DOUBLE)
asm volatile(
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+v"(vx1)
:"v"(mask1), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+v"(vx1)
:"v"(mask1), "f"(zero), "r"(gvl)
:"v0");
#endif
*/
vx0 = VFADDVV_FLOAT(vx0, vx1, gvl);
vx = VLSEV_FLOAT(&x[idx], stride_x, gvl);
vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl);
vx = VFABS_FLOAT(vx, gvl);
vx2 = VFABS_FLOAT(vx2, gvl);
vx = VFADDVV_FLOAT(vx, vx2, gvl);

//index where element greater than v_max
mask0 = VMFLTVV_FLOAT(v_max, vx0, gvl);
v_max_index = VIDV_MASK_UINT(mask0, v_max_index, gvl);
/*
#if defined(DOUBLE)
asm volatile(
"vor.vv v0, %1, %1 \n\t"
"vsetvli x0, %2, e64,m8 \n\t"
"vid.v %0, v0.t \n\t"
:"+v"(v_max_index)
:"v"(mask0), "r"(gvl)
:"v0");
#else
asm volatile(
"vor.vv v0, %1, %1 \n\t"
"vsetvli x0, %2, e32,m8 \n\t"
"vid.v %0, v0.t \n\t"
:"+v"(v_max_index)
:"v"(mask0), "r"(gvl)
:"v0");
#endif
*/
v_max_index = VADDVX_MASK_UINT(mask0, v_max_index, v_max_index, j, gvl);
mask = VMFLTVV_FLOAT(v_max, vx, gvl);
v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl);
v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl);

//update v_max and start_index j
v_max = VFMAXVV_FLOAT(v_max, vx0, gvl);
v_max = VFMAXVV_FLOAT(v_max, vx, gvl);
j += gvl;
ix += inc_xv;
idx += inc_v;
}
vx0 = VFMVVF_FLOAT(0, gvl);
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl);
maxf = VFMVFS_FLOAT(v_res);
mask0 = VMFGEVF_FLOAT(v_max, maxf, gvl);
max_index = VMFIRSTM(mask0,gvl);
VSEVU_UINT(temp_uint,v_max_index,gvl);
max_index = temp_uint[max_index];

v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_res, gvl);
maxf = EXTRACT_FLOAT(v_res);
mask = VMFGEVF_FLOAT(v_max, maxf, gvl);
UINT_V_T compressed;
compressed = VCOMPRESS(mask, compressed, v_max_index, gvl);
max_index = VMV_X(compressed);

if(j < n){
gvl = VSETVL(n-j);
v_max_index = VMVVX_UINT(0, gvl);
vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
//fabs(vector)
mask0 = VMFLTVF_FLOAT(vx0, 0, gvl);
vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl);
/*
#if defined(DOUBLE)
asm volatile(
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+v"(vx0)
:"v"(mask0), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+v"(vx0)
:"v"(mask0), "f"(zero), "r"(gvl)
:"v0");
#endif
*/
vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
//fabs(vector)
mask1 = VMFLTVF_FLOAT(vx1, 0, gvl);
vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl);
/*
#if defined(DOUBLE)
asm volatile(
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+v"(vx1)
:"v"(mask1), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+v"(vx1)
:"v"(mask1), "f"(zero), "r"(gvl)
:"v0");
#endif
*/
v_max = VFADDVV_FLOAT(vx0, vx1, gvl);
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl);
FLOAT cur_maxf = VFMVFS_FLOAT(v_res);
v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl);
vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl);
v_max = VFABS_FLOAT(v_max, gvl);
vx2 = VFABS_FLOAT(vx2, gvl);
v_max = VFADDVV_FLOAT(v_max, vx2, gvl);

v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_res, gvl);
FLOAT cur_maxf = EXTRACT_FLOAT(v_res);

if(cur_maxf > maxf){
//tail index
v_max_index = VIDV_UINT(gvl);
v_max_index = VADDVX_UINT(v_max_index, j, gvl);

mask0 = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
max_index = VMFIRSTM(mask0,gvl);
VSEVU_UINT(temp_uint,v_max_index,gvl);
max_index = temp_uint[max_index];

mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
UINT_V_T compressed;
compressed = VCOMPRESS(mask, compressed, v_max_index, gvl);
max_index = VMV_X(compressed);
}
}
return(max_index+1);

return(max_index+1);
}

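For these complex izamax/izamin kernels the quantity being compared is |Re|+|Im|, the usual BLAS convention for the complex amax/amin routines rather than the Euclidean modulus, and the rewrite computes it with two strided loads, vfabs on each component, and a vector add in place of the old masked vfrsub negation. A scalar model of what one lane of the vectorised loop computes (hypothetical helper, for orientation only):

    // z points at one interleaved (re, im) pair; ABS is the file's fabs/fabsf macro
    static inline FLOAT cabs1(const FLOAT *z) {
        return ABS(z[0]) + ABS(z[1]);   // |real| + |imag|
    }
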
@ -32,21 +32,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(DOUBLE)

#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
#define VLSEV_FLOAT vlse_v_f64m8
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
#define MASK_T vbool8_t
#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
#define VMFGTVV_FLOAT vmfgt_vv_f64m8_b8
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
#define VFMINVV_FLOAT vfmin_vv_f64m8
#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8
#define VMFIRSTM vmfirst_m_b8
#define VMFIRSTM vfirst_m_b8
#define UINT_V_T vuint64m8_t
#define VSEVU_UINT vse64_v_u64m8
#define UINT_T long unsigned int

@ -54,27 +51,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define VIDV_UINT vid_v_u64m8
#define VADDVX_MASK_UINT vadd_vx_u64m8_m
#define VADDVX_UINT vadd_vx_u64m8
#define VFADDVV_FLOAT vfadd_vv_f64m8
#define VMVVX_UINT vmv_v_x_u64m8
#define VFABS_FLOAT vfabs_v_f64m8
#define VFADDVV_FLOAT vfadd_vv_f64m8
#define VCOMPRESS vcompress_vm_u64m8
#define VMV_X vmv_x_s_u64m8_u64
#else

#define ABS fabsf
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
#define VLSEV_FLOAT vlse_v_f32m8
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
#define MASK_T vbool4_t
#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
#define VMFGTVV_FLOAT vmfgt_vv_f32m8_b4
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
#define VFMINVV_FLOAT vfmin_vv_f32m8
#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4
#define VMFIRSTM vmfirst_m_b4
#define VMFIRSTM vfirst_m_b4
#define UINT_V_T vuint32m8_t
#define UINT_T unsigned int
#define VSEVU_UINT vse32_v_u32m8

@ -82,184 +78,81 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define VIDV_UINT vid_v_u32m8
#define VADDVX_MASK_UINT vadd_vx_u32m8_m
#define VADDVX_UINT vadd_vx_u32m8
#define VFADDVV_FLOAT vfadd_vv_f32m8
#define VMVVX_UINT vmv_v_x_u32m8
#define VFABS_FLOAT vfabs_v_f32m8
#define VFADDVV_FLOAT vfadd_vv_f32m8
#define VCOMPRESS vcompress_vm_u32m8
#define VMV_X vmv_x_s_u32m8_u32
#endif

BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i=0, j=0;
FLOAT minf=FLT_MAX;
BLASLONG i=0, j=0;
unsigned int min_index = 0;
if (n <= 0 || inc_x <= 0) return(min_index);
if (n <= 0 || inc_x <= 0) return(min_index);
FLOAT minf=FLT_MAX;

FLOAT_V_T vx0, vx1, v_min;
FLOAT_V_T vx, vx2, v_min;
UINT_V_T v_min_index;
MASK_T mask0, mask1;
MASK_T mask;
unsigned int gvl = 0;
FLOAT_V_T_M1 v_res, v_max;
gvl = VSETVL_MAX;
v_res = VFMVVF_FLOAT_M1(0, gvl);
v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl);
FLOAT_V_T_M1 v_res;
v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1);

gvl = VSETVL(n);
UINT_T temp_uint[gvl];
v_min_index = VMVVX_UINT(0, gvl);
v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
BLASLONG inc_xv = gvl * inc_x * 2;
BLASLONG ix = 0;
for(i=0,j=0; i < n/gvl; i++){
vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
//fabs(vector)
mask0 = VMFLTVF_FLOAT(vx0, 0, gvl);
vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl);
/*
#if defined(DOUBLE)
asm volatile(
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+v"(vx0)
:"v"(mask0), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+v"(vx0)
:"v"(mask0), "f"(zero), "r"(gvl)
:"v0");
#endif
*/
vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
//fabs(vector)
mask1 = VMFLTVF_FLOAT(vx1, 0, gvl);
vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl);
/*
#if defined(DOUBLE)
asm volatile(
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+v"(vx1)
:"v"(mask1), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+v"(vx1)
:"v"(mask1), "f"(zero), "r"(gvl)
:"v0");
#endif
*/
vx0 = VFADDVV_FLOAT(vx0, vx1, gvl);
unsigned int stride_x = inc_x * 2 * sizeof(FLOAT);
unsigned int idx = 0, inc_v = gvl * inc_x * 2;

//index where element less than v_min
mask0 = VMFLTVV_FLOAT(vx0, v_min, gvl);
v_min_index = VIDV_MASK_UINT(mask0, v_min_index, gvl);
/*
#if defined(DOUBLE)
asm volatile(
"vor.vv v0, %1, %1 \n\t"
"vsetvli x0, %2, e64,m8 \n\t"
"vid.v %0, v0.t \n\t"
:"+v"(v_min_index)
:"v"(mask0), "r"(gvl)
:"v0");
#else
asm volatile(
"vor.vv v0, %1, %1 \n\t"
"vsetvli x0, %2, e32,m8 \n\t"
"vid.v %0, v0.t \n\t"
:"+v"(v_min_index)
:"v"(mask0), "r"(gvl)
:"v0");
#endif
*/
v_min_index = VADDVX_MASK_UINT(mask0, v_min_index, v_min_index, j, gvl);
v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
v_min_index = VMVVX_UINT(0, gvl);
for(i=0,j=0; i < n/gvl; i++){
vx = VLSEV_FLOAT(&x[idx], stride_x, gvl);
vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl);
vx = VFABS_FLOAT(vx, gvl);
vx2 = VFABS_FLOAT(vx2, gvl);
vx = VFADDVV_FLOAT(vx, vx2, gvl);

//index where element greater than v_min
mask = VMFGTVV_FLOAT(v_min, vx, gvl);
v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl);
v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl);

//update v_min and start_index j
v_min = VFMINVV_FLOAT(v_min, vx0, gvl);
v_min = VFMINVV_FLOAT(v_min, vx, gvl);
j += gvl;
ix += inc_xv;
idx += inc_v;
}
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
minf = VFMVFS_FLOAT(v_res);
mask0 = VMFLEVF_FLOAT(v_min, minf, gvl);
min_index = VMFIRSTM(mask0,gvl);
VSEVU_UINT(temp_uint,v_min_index,gvl);
min_index = temp_uint[min_index];

v_res = VFREDMINVS_FLOAT(v_res, v_min, v_res, gvl);
minf = EXTRACT_FLOAT(v_res);
mask = VMFLEVF_FLOAT(v_min, minf, gvl);
UINT_V_T compressed;
compressed = VCOMPRESS(mask, compressed, v_min_index, gvl);
min_index = VMV_X(compressed);

if(j < n){
gvl = VSETVL(n-j);
v_min_index = VMVVX_UINT(0, gvl);
vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
//fabs(vector)
mask0 = VMFLTVF_FLOAT(vx0, 0, gvl);
vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl);
/*
#if defined(DOUBLE)
asm volatile(
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+v"(vx0)
:"v"(mask0), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+v"(vx0)
:"v"(mask0), "f"(zero), "r"(gvl)
:"v0");
#endif
*/
vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
//fabs(vector)
mask1 = VMFLTVF_FLOAT(vx1, 0, gvl);
vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl);
/*
#if defined(DOUBLE)
asm volatile(
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+v"(vx1)
:"v"(mask1), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
:"+v"(vx1)
:"v"(mask1), "f"(zero), "r"(gvl)
:"v0");
#endif
*/
v_min = VFADDVV_FLOAT(vx0, vx1, gvl);
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
FLOAT cur_minf = VFMVFS_FLOAT(v_res);
if(cur_minf < minf){
v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl);
vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl);
v_min = VFABS_FLOAT(v_min, gvl);
vx2 = VFABS_FLOAT(vx2, gvl);
v_min = VFADDVV_FLOAT(v_min, vx2, gvl);

v_res = VFREDMINVS_FLOAT(v_res, v_min, v_res, gvl);
FLOAT cur_minf = EXTRACT_FLOAT(v_res);
if(cur_minf > minf){
//tail index
v_min_index = VIDV_UINT(gvl);
v_min_index = VADDVX_UINT(v_min_index, j, gvl);

mask0 = VMFLEVF_FLOAT(v_min, cur_minf, gvl);
min_index = VMFIRSTM(mask0,gvl);
VSEVU_UINT(temp_uint,v_min_index,gvl);
min_index = temp_uint[min_index];

mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl);
UINT_V_T compressed;
compressed = VCOMPRESS(mask, compressed, v_min_index, gvl);
min_index = VMV_X(compressed);
}
}
return(min_index+1);

return(min_index+1);
}

@ -28,30 +28,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include <math.h>
#include <float.h>
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLEV_FLOAT vle_v_f32m8
#define VLSEV_FLOAT vlse_v_f32m8
#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFMAXVV_FLOAT vfmax_vv_f32m8

#define LMUL m4
#if defined(DOUBLE)
# define ELEN 64
# define MLEN 8
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLEV_FLOAT vle_v_f64m8
#define VLSEV_FLOAT vlse_v_f64m8
#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFMAXVV_FLOAT vfmax_vv_f64m8
# define ELEN 32
# define MLEN 4
#endif

#define _
#define JOIN2_X(x, y) x ## y
#define JOIN2(x, y) JOIN2_X(x, y)
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)

#define VSETVL JOIN(vsetvl, _e, ELEN, LMUL, _)
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _)
#define VLEV_FLOAT JOIN(vle, ELEN, _v_f, ELEN, LMUL)
#define VLSEV_FLOAT JOIN(vlse, ELEN, _v_f, ELEN, LMUL)
#define VFREDMAXVS_FLOAT JOIN(vfredmax_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1))
#define MASK_T JOIN(vbool, MLEN, _t, _, _)
#define VMFLTVF_FLOAT JOIN(vmflt_vf_f, ELEN, LMUL, _b, MLEN)
#define VFMVVF_FLOAT JOIN(vfmv, _v_f_f, ELEN, LMUL, _)
#define VFMVVF_FLOAT_M1 JOIN(vfmv, _v_f_f, ELEN, m1, _)
#define VFMAXVV_FLOAT JOIN(vfmax, _vv_f, ELEN, LMUL, _)

FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i=0, j=0;

@ -59,10 +62,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
FLOAT maxf=-FLT_MAX;
unsigned int gvl = 0;
FLOAT_V_T v0, v1, v_max;
FLOAT_V_T_M1 v_res, v_min;
gvl = VSETVL_MAX;
v_res = VFMVVF_FLOAT_M1(0, gvl);
v_min = VFMVVF_FLOAT_M1(-FLT_MAX, gvl);
FLOAT_V_T_M1 v_res;
v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1);

if(inc_x == 1){
gvl = VSETVL(n);

@ -76,15 +77,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
v_max = VFMAXVV_FLOAT(v_max, v1, gvl);
j += gvl * 2;
}
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl);
maxf = *((FLOAT*)&v_res);
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_res, gvl);
}
for(;j<n;){
gvl = VSETVL(n-j);
v0 = VLEV_FLOAT(&x[j], gvl);
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_min, gvl);
if(*((FLOAT*)&v_res) > maxf)
maxf = *((FLOAT*)&v_res);
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_res, gvl);
j += gvl;
}
}else{

@ -102,18 +100,16 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
j += gvl * 2;
idx += inc_xv * 2;
}
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl);
maxf = *((FLOAT*)&v_res);
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_res, gvl);
}
for(;j<n;){
gvl = VSETVL(n-j);
v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_min, gvl);
if(*((FLOAT*)&v_res) > maxf)
maxf = *((FLOAT*)&v_res);
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_res, gvl);
j += gvl;
}
}
maxf = EXTRACT_FLOAT(v_res);
return(maxf);
}

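The JOIN machinery introduced above is the mechanism behind the "easily reconfigure" goal: JOIN2_X/JOIN2 are the standard two-level token-pasting idiom (the extra level of indirection forces arguments such as ELEN and LMUL to be macro-expanded before ## fires), and `#define _` supplies an empty token for unused slots of the five-way JOIN. One concrete expansion, with ELEN 32 and LMUL m4 as set in this file:

    #define VSETVL JOIN(vsetvl, _e, ELEN, LMUL, _)
    /* -> JOIN2(JOIN2(JOIN2(JOIN2(vsetvl, _e), 32), m4), )   after argument expansion
       -> vsetvl_e32m4                                       after token pasting      */

So retargeting a whole kernel to a different register-group size is a one-line edit to `#define LMUL`.
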
@ -28,30 +28,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include <math.h>
#include <float.h>
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLEV_FLOAT vle_v_f32m8
#define VLSEV_FLOAT vlse_v_f32m8
#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFMINVV_FLOAT vfmin_vv_f32m8

#define LMUL m4
#if defined(DOUBLE)
# define ELEN 64
# define MLEN 8
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLEV_FLOAT vle_v_f64m8
#define VLSEV_FLOAT vlse_v_f64m8
#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFMINVV_FLOAT vfmin_vv_f64m8
# define ELEN 32
# define MLEN 4
#endif

#define _
#define JOIN2_X(x, y) x ## y
#define JOIN2(x, y) JOIN2_X(x, y)
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)

#define VSETVL JOIN(vsetvl, _e, ELEN, LMUL, _)
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _)
#define VLEV_FLOAT JOIN(vle, ELEN, _v_f, ELEN, LMUL)
#define VLSEV_FLOAT JOIN(vlse, ELEN, _v_f, ELEN, LMUL)
#define VFREDMINVS_FLOAT JOIN(vfredmin_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1))
#define MASK_T JOIN(vbool, MLEN, _t, _, _)
#define VMFLTVF_FLOAT JOIN(vmflt_vf_f, ELEN, LMUL, _b, MLEN)
#define VFMVVF_FLOAT JOIN(vfmv, _v_f_f, ELEN, LMUL, _)
#define VFMVVF_FLOAT_M1 JOIN(vfmv, _v_f_f, ELEN, m1, _)
#define VFRSUBVF_MASK_FLOAT JOIN(vfrsub,_vf_f, ELEN, LMUL, _m)
#define VFMINVV_FLOAT JOIN(vfmin, _vv_f, ELEN, LMUL, _)

FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i=0, j=0;

@ -59,10 +63,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
FLOAT minf=FLT_MAX;
unsigned int gvl = 0;
FLOAT_V_T v0, v1, v_min;
FLOAT_V_T_M1 v_res, v_max;
gvl = VSETVL_MAX;
v_res = VFMVVF_FLOAT_M1(0, gvl);
v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl);
FLOAT_V_T_M1 v_res;
v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1);

if(inc_x == 1){
gvl = VSETVL(n);

@ -76,15 +78,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
v_min = VFMINVV_FLOAT(v_min, v1, gvl);
j += gvl * 2;
}
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
minf = *((FLOAT*)&v_res);
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_res, gvl);
}
for(;j<n;){
gvl = VSETVL(n-j);
v0 = VLEV_FLOAT(&x[j], gvl);
v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl);
if(*((FLOAT*)&v_res) < minf)
minf = *((FLOAT*)&v_res);
v_res = VFREDMINVS_FLOAT(v_res, v0, v_res, gvl);
j += gvl;
}
}else{

@ -102,18 +101,16 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
j += gvl * 2;
idx += inc_xv * 2;
}
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
minf = *((FLOAT*)&v_res);
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_res, gvl);
}
for(;j<n;){
gvl = VSETVL(n-j);
v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl);
if(*((FLOAT*)&v_res) < minf)
minf = *((FLOAT*)&v_res);
v_res = VFREDMINVS_FLOAT(v_res, v0, v_res, gvl);
j += gvl;
}
}
minf = EXTRACT_FLOAT(v_res);
return(minf);
}

@ -26,207 +26,180 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m4_t
#define VFMVFS_FLOATM4 vfmv_f_s_f32m4_f32
#define FLOAT_V_T_M1 vfloat32m1_t
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
#define VLEV_FLOAT vle_v_f32m4
#define VLSEV_FLOAT vlse_v_f32m4
#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
#define VFMACCVV_FLOAT vfmacc_vv_f32m4
#define VFMVVF_FLOAT vfmv_v_f_f32m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFDOTVV_FLOAT vfdot_vv_f32m4
#define ABS fabsf
#define MASK_T vbool8_t
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m4_m
#define VMFGTVF_FLOAT vmfgt_vf_f32m4_b8
#define VMFIRSTM vmfirst_m_b8
#define VFDIVVF_FLOAT vfdiv_vf_f32m4
#define VMFLTVF_FLOAT vmflt_vf_f32m4_b8
#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1

#define LMUL m1
#if defined(DOUBLE)
# define ELEN 64
# define MLEN 16
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m4_t
#define VFMVFS_FLOATM4 vfmv_f_s_f64m4_f64
#define FLOAT_V_T_M1 vfloat64m1_t
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
#define VLEV_FLOAT vle_v_f64m4
#define VLSEV_FLOAT vlse_v_f64m4
#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
#define VFMACCVV_FLOAT vfmacc_vv_f64m4
#define VFMVVF_FLOAT vfmv_v_f_f64m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFDOTVV_FLOAT vfdot_vv_f64m4
#define ABS fabs
#define MASK_T vbool16_t
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m4_m
#define VMFGTVF_FLOAT vmfgt_vf_f64m4_b16
#define VMFIRSTM vmfirst_m_b16
#define VFDIVVF_FLOAT vfdiv_vf_f64m4
#define VMFLTVF_FLOAT vmflt_vf_f64m4_b16
#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1
# define ELEN 32
# define MLEN 8
#endif

#define _
#define JOIN2_X(x, y) x ## y
#define JOIN2(x, y) JOIN2_X(x, y)
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)

#define VSETVL JOIN(vsetvl, _e, ELEN, LMUL, _)
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _)
#define VLEV_FLOAT JOIN(vle, ELEN, _v_f, ELEN, LMUL)
#define VLSEV_FLOAT JOIN(vlse, ELEN, _v_f, ELEN, LMUL)
#define VFMVVF_FLOAT JOIN(vfmv, _v_f_f, ELEN, LMUL, _)
#define VFMVSF_FLOAT JOIN(vfmv, _s_f_f, ELEN, LMUL, _)
#define VFMVVF_FLOAT_M1 JOIN(vfmv, _v_f_f, ELEN, m1, _)
#define MASK_T JOIN(vbool, MLEN, _t, _, _)
#define VFABS JOIN(vfabs, _v_f, ELEN, LMUL, _)
#define VMFNE JOIN(vmfne_vf_f,ELEN, LMUL, _b, MLEN)
#define VMFGT JOIN(vmfgt_vv_f,ELEN, LMUL, _b, MLEN)
#define VMFEQ JOIN(vmfeq_vf_f,ELEN, LMUL, _b, MLEN)
#define VCPOP JOIN(vcpop, _m_b, MLEN, _, _)
#define VFREDMAX JOIN(vfredmax_vs_f,ELEN,LMUL, JOIN2(_f, ELEN), m1)
#define VFREDMIN JOIN(vfredmin_vs_f,ELEN,LMUL, JOIN2(_f, ELEN), m1)
#define VFIRST JOIN(vfirst, _m_b, MLEN, _, _)
#define VRGATHER JOIN(vrgather, _vx_f, ELEN, LMUL, _)
#define VFDIV JOIN(vfdiv, _vv_f, ELEN, LMUL, _)
#define VFDIV_M JOIN(vfdiv, _vv_f, ELEN, LMUL, _m)
#define VFMUL JOIN(vfmul, _vv_f, ELEN, LMUL, _)
#define VFMUL_M JOIN(vfmul, _vv_f, ELEN, LMUL, _m)
#define VFMACC JOIN(vfmacc, _vv_f, ELEN, LMUL, _)
#define VFMACC_M JOIN(vfmacc, _vv_f, ELEN, LMUL, _m)
#define VMSBF JOIN(vmsbf, _m_b, MLEN, _, _)
#define VMSOF JOIN(vmsof, _m_b, MLEN, _, _)
#define VMAND JOIN(vmand, _mm_b, MLEN, _, _)
#define VMANDN JOIN(vmandn, _mm_b, MLEN, _, _)
#define VFREDSUM JOIN(vfredusum_vs_f,ELEN,LMUL, JOIN2(_f, ELEN), m1)
#define VMERGE JOIN(vmerge, _vvm_f, ELEN, LMUL, _)

#define VSEV_FLOAT JOIN(vse, ELEN, _v_f, ELEN, LMUL)

#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif

#define EXTRACT_FLOAT0_V(v) JOIN(vfmv_f_s_f, ELEN, LMUL, _f, ELEN)(v)

//#define DUMP( label, v0, gvl )
#define DUMP( label, v0, gvl ) do{ FLOAT x[16]; VSEV_FLOAT( x, v0, gvl ); printf ("%s(%d): %s [ ", __FILE__, __LINE__, label); for( int xxx = 0; xxx < gvl; ++xxx ) { printf("%f, ", x[xxx]); } printf(" ]\n"); } while(0)

FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i=0, j=0;
BLASLONG i=0;

if ( n < 0 ) return(0.0);
if(n <= 0) return(0.0);
if(n == 1) return (ABS(x[0]));

FLOAT_V_T vr, v0, v_zero;
unsigned int gvl = 0;
FLOAT_V_T_M1 v_res, v_z0;
gvl = VSETVL_MAX;
v_res = VFMVVF_FLOAT_M1(0, gvl);
v_z0 = VFMVVF_FLOAT_M1(0, gvl);

FLOAT scale = 0.0, ssq = 0.0;
MASK_T mask;
BLASLONG index = 0;
if(inc_x == 1){
gvl = VSETVL(n);
vr = VFMVVF_FLOAT(0, gvl);
v_zero = VFMVVF_FLOAT(0, gvl);
for(i=0,j=0; i<n/gvl; i++){
v0 = VLEV_FLOAT(&x[j], gvl);
//fabs(vector)
mask = VMFLTVF_FLOAT(v0, 0, gvl);
v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
//if scale change
mask = VMFGTVF_FLOAT(v0, scale, gvl);
index = VMFIRSTM(mask, gvl);
if(index == -1){//no elements greater than scale
if(scale != 0.0){
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
vr = VFMACCVV_FLOAT(vr, v0, v0, gvl);
}
}else{//found greater element
//ssq in vector vr: vr[0]
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
//total ssq before current vector
ssq += VFMVFS_FLOAT(v_res);
//find max
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
//update ssq before max_index
ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
//update scale
scale = VFMVFS_FLOAT(v_res);
//ssq in vector vr
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
}
j += gvl;
}
//ssq in vector vr: vr[0]
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
//total ssq now
ssq += VFMVFS_FLOAT(v_res);
v_res = VFMVVF_FLOAT_M1(0, 1);
v_z0 = VFMVVF_FLOAT_M1(0, 1);

//tail
if(j < n){
gvl = VSETVL(n-j);
v0 = VLEV_FLOAT(&x[j], gvl);
//fabs(vector)
mask = VMFLTVF_FLOAT(v0, 0, gvl);
v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
//if scale change
mask = VMFGTVF_FLOAT(v0, scale, gvl);
index = VMFIRSTM(mask, gvl);
if(index == -1){//no elements greater than scale
if(scale != 0.0)
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
}else{//found greater element
//find max
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
//update ssq before max_index
ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
//update scale
scale = VFMVFS_FLOAT(v_res);
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
}
vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
//ssq in vector vr: vr[0]
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
//total ssq now
ssq += VFMVFS_FLOAT(v_res);
}
}else{
gvl = VSETVL(n);
vr = VFMVVF_FLOAT(0, gvl);
v_zero = VFMVVF_FLOAT(0, gvl);
unsigned int stride_x = inc_x * sizeof(FLOAT);
int idx = 0, inc_v = inc_x * gvl;
for(i=0,j=0; i<n/gvl; i++){
v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl);
//fabs(vector)
mask = VMFLTVF_FLOAT(v0, 0, gvl);
v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
//if scale change
mask = VMFGTVF_FLOAT(v0, scale, gvl);
index = VMFIRSTM(mask, gvl);
if(index == -1){//no elements greater than scale
if(scale != 0.0){
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
vr = VFMACCVV_FLOAT(vr, v0, v0, gvl);
}
}else{//found greater element
//ssq in vector vr: vr[0]
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
//total ssq before current vector
ssq += VFMVFS_FLOAT(v_res);
//find max
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
//update ssq before max_index
ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
//update scale
scale = VFMVFS_FLOAT(v_res);
//ssq in vector vr
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
}
j += gvl;
idx += inc_v;
}
//ssq in vector vr: vr[0]
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
//total ssq now
ssq += VFMVFS_FLOAT(v_res);
MASK_T nonzero_mask;
MASK_T scale_mask;
BLASLONG scale_index = 0;

//tail
if(j < n){
gvl = VSETVL(n-j);
v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl);
//fabs(vector)
mask = VMFLTVF_FLOAT(v0, 0, gvl);
v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
//if scale change
mask = VMFGTVF_FLOAT(v0, scale, gvl);
index = VMFIRSTM(mask, gvl);
if(index == -1){//no elements greater than scale
if(scale != 0.0)
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
}else{//found greater element
//find max
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
//update ssq before max_index
ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
//update scale
scale = VFMVFS_FLOATM4(vr);
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
gvl = VSETVL(n);
FLOAT_V_T v0;
FLOAT_V_T v_ssq = VFMVVF_FLOAT(0, gvl);
FLOAT_V_T v_scale = VFMVVF_FLOAT(0, gvl);
FLOAT_V_T v_one = VFMVVF_FLOAT(1, gvl);

FLOAT scale = 0;
FLOAT ssq = 0;
unsigned int stride_x = inc_x * sizeof(FLOAT);
int idx = 0;

if( n >= gvl ) // don't pay overheads if we're not doing useful work
{
for(i=0; i<n/gvl; i++){
v0 = VLSEV_FLOAT( &x[idx], stride_x, gvl );
nonzero_mask = VMFNE( v0, 0, gvl );
v0 = VFABS( v0, gvl );
scale_mask = VMFGT( v0, v_scale, gvl );

// assume scale changes are relatively infrequent

// unclear if the vcpop+branch is actually a win
// since the operations being skipped are predicated anyway
// need profiling to confirm
if( VCPOP(scale_mask, gvl) )
{
v_scale = VFDIV_M( scale_mask, v_scale, v_scale, v0, gvl );
v_scale = VFMUL_M( scale_mask, v_scale, v_scale, v_scale, gvl );
v_ssq = VFMUL_M( scale_mask, v_ssq, v_ssq, v_scale, gvl );
v_scale = VMERGE( scale_mask, v_scale, v0, gvl );
}
v0 = VFDIV_M( nonzero_mask, v0, v0, v_scale, gvl );
v_ssq = VFMACC_M( nonzero_mask, v_ssq, v0, v0, gvl );
idx += inc_x * gvl;
}

// we have gvl elements which we accumulated independently, with independent scales
// we need to combine these
// naive sort so we process small values first to avoid losing information
// could use vector sort extensions where available, but we're dealing with gvl elts at most

FLOAT * out_ssq = alloca(gvl*sizeof(FLOAT));
FLOAT * out_scale = alloca(gvl*sizeof(FLOAT));
VSEV_FLOAT( out_ssq, v_ssq, gvl );
VSEV_FLOAT( out_scale, v_scale, gvl );
for( int a = 0; a < (gvl-1); ++a )
{
int smallest = a;
for( size_t b = a+1; b < gvl; ++b )
if( out_scale[b] < out_scale[smallest] )
smallest = b;
if( smallest != a )
{
FLOAT tmp1 = out_ssq[a];
FLOAT tmp2 = out_scale[a];
out_ssq[a] = out_ssq[smallest];
out_scale[a] = out_scale[smallest];
out_ssq[smallest] = tmp1;
out_scale[smallest] = tmp2;
}
}

int a = 0;
while( a<gvl && out_scale[a] == 0 )
++a;

if( a < gvl )
{
ssq = out_ssq[a];
scale = out_scale[a];
++a;
for( ; a < gvl; ++a )
{
ssq = ssq * ( scale / out_scale[a] ) * ( scale / out_scale[a] ) + out_ssq[a];
scale = out_scale[a];
}
vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
//ssq in vector vr: vr[0]
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
//total ssq now
ssq += VFMVFS_FLOAT(v_res);
}
}

//finish any tail using scalar ops
i*=gvl*inc_x;
n*=inc_x;
while(i < n){
if ( x[i] != 0.0 ){
FLOAT absxi = ABS( x[i] );
if ( scale < absxi ){
ssq = 1 + ssq * ( scale / absxi ) * ( scale / absxi );
scale = absxi ;
}
else{
ssq += ( absxi/scale ) * ( absxi/scale );
}
}

i += inc_x;
}

return(scale * sqrt(ssq));
}

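The rewritten nrm2 keeps gvl independent (scale, ssq) running pairs, one per lane, under the invariant that a lane's partial sum of squares equals scale*scale*ssq; merging the lanes smallest-scale-first is the classic LAPACK-style update, the same rule the scalar tail loop above applies. Written out for two partial results it is:

    /* combine (scale_a, ssq_a) into (scale_b, ssq_b), assuming scale_a <= scale_b;
       invariant: partial sum of squares == scale*scale*ssq                        */
    ssq   = ssq_a * (scale_a / scale_b) * (scale_a / scale_b) + ssq_b;
    scale = scale_b;
    /* final result: scale * sqrt(ssq) */

Processing the lanes in ascending scale order keeps every rescaling factor at most 1, so the merge itself can neither overflow nor amplify rounding error, which is the point of the naive sort.
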
@ -31,9 +31,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define VSETVL_MAX vsetvlmax_e32m1()
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VLEV_FLOAT vle32_v_f32m8
|
||||
#define VLSEV_FLOAT vlse32_v_f32m8
|
||||
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
|
||||
#define VLEV_FLOAT vle_v_f32m8
|
||||
#define VLSEV_FLOAT vlse_v_f32m8
|
||||
#define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1
|
||||
#define VFMACCVV_FLOAT vfmacc_vv_f32m8
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f32m8
|
||||
|
@ -45,9 +45,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define VSETVL_MAX vsetvlmax_e64m1()
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VLEV_FLOAT vle64_v_f64m8
|
||||
#define VLSEV_FLOAT vlse64_v_f64m8
|
||||
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
|
||||
#define VLEV_FLOAT vle_v_f64m8
|
||||
#define VLSEV_FLOAT vlse_v_f64m8
|
||||
#define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1
|
||||
#define VFMACCVV_FLOAT vfmacc_vv_f64m8
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f64m8
|
||||
|
|
|
@@ -31,10 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define VSETVL(n) vsetvl_e32m4(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m4_t
#define VLEV_FLOAT vle_v_f32m4
#define VLSEV_FLOAT vlse_v_f32m4
#define VSEV_FLOAT vse_v_f32m4
#define VSSEV_FLOAT vsse_v_f32m4
#define VLEV_FLOAT vle32_v_f32m4
#define VLSEV_FLOAT vlse32_v_f32m4
#define VSEV_FLOAT vse32_v_f32m4
#define VSSEV_FLOAT vsse32_v_f32m4
#define VFMACCVF_FLOAT vfmacc_vf_f32m4
#define VFMULVF_FLOAT vfmul_vf_f32m4
#define VFMSACVF_FLOAT vfmsac_vf_f32m4
@@ -42,10 +42,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define VSETVL(n) vsetvl_e64m4(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m4_t
#define VLEV_FLOAT vle_v_f64m4
#define VLSEV_FLOAT vlse_v_f64m4
#define VSEV_FLOAT vse_v_f64m4
#define VSSEV_FLOAT vsse_v_f64m4
#define VLEV_FLOAT vle64_v_f64m4
#define VLSEV_FLOAT vlse64_v_f64m4
#define VSEV_FLOAT vse64_v_f64m4
#define VSSEV_FLOAT vsse64_v_f64m4
#define VFMACCVF_FLOAT vfmacc_vf_f64m4
#define VFMULVF_FLOAT vfmul_vf_f64m4
#define VFMSACVF_FLOAT vfmsac_vf_f64m4
@@ -57,11 +57,10 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
        BLASLONG ix=0,iy=0;

        if(n <= 0) return(0);
        unsigned int gvl = 0;
        unsigned int gvl = VSETVL((inc_x != 0 && inc_y != 0) ? n : 1);
        FLOAT_V_T v0, v1, vx, vy;

        if(inc_x == 1 && inc_y == 1){
                gvl = VSETVL(n);
                for(i=0,j=0; i<n/gvl; i++){
                        vx = VLEV_FLOAT(&x[j], gvl);
                        vy = VLEV_FLOAT(&y[j], gvl);
@@ -90,7 +89,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
                        VSEV_FLOAT(&y[j], v1, gvl);
                }
        }else if(inc_y == 1){
                gvl = VSETVL(n);
                BLASLONG stride_x = inc_x * sizeof(FLOAT);
                BLASLONG inc_xv = inc_x * gvl;
                for(i=0,j=0; i<n/gvl; i++){
@@ -122,7 +120,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
                        VSEV_FLOAT(&y[j], v1, gvl);
                }
        }else if(inc_x == 1){
                gvl = VSETVL(n);
                BLASLONG stride_y = inc_y * sizeof(FLOAT);
                BLASLONG inc_yv = inc_y * gvl;
                for(i=0,j=0; i<n/gvl; i++){
@@ -154,7 +151,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
                        VSSEV_FLOAT(&y[j*inc_y], stride_y, v1, gvl);
                }
        }else{
                gvl = VSETVL(n);
                BLASLONG stride_x = inc_x * sizeof(FLOAT);
                BLASLONG stride_y = inc_y * sizeof(FLOAT);
                BLASLONG inc_xv = inc_x * gvl;
@@ -26,28 +26,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define VLEV_FLOAT vle_v_f32m8
#define VLSEV_FLOAT vlse_v_f32m8
#define VSEV_FLOAT vse_v_f32m8
#define VSSEV_FLOAT vsse_v_f32m8
#define VFMULVF_FLOAT vfmul_vf_f32m8
#define VFMVVF_FLOAT vfmv_v_f_f32m8

#define LMUL m4
#if defined(DOUBLE)
# define ELEN 64
# define MLEN 8
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m8_t
#define VLEV_FLOAT vle_v_f64m8
#define VLSEV_FLOAT vlse_v_f64m8
#define VSEV_FLOAT vse_v_f64m8
#define VSSEV_FLOAT vsse_v_f64m8
#define VFMULVF_FLOAT vfmul_vf_f64m8
#define VFMVVF_FLOAT vfmv_v_f_f64m8
# define ELEN 32
# define MLEN 4
#endif

#define _
#define JOIN2_X(x, y) x ## y
#define JOIN2(x, y) JOIN2_X(x, y)
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)

#define VSETVL JOIN(vsetvl, _e, ELEN, LMUL, _)
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
#define VLEV_FLOAT JOIN(vle, ELEN, _v_f, ELEN, LMUL)
#define VLSEV_FLOAT JOIN(vlse, ELEN, _v_f, ELEN, LMUL)
#define VSEV_FLOAT JOIN(vse, ELEN, _v_f, ELEN, LMUL)
#define VSSEV_FLOAT JOIN(vsse, ELEN, _v_f, ELEN, LMUL)
#define VFMVVF_FLOAT JOIN(vfmv, _v_f_f, ELEN, LMUL, _)
#define VFMULVF_FLOAT JOIN(vfmul, _vf_f, ELEN, LMUL, _)

int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
        BLASLONG i=0,j=0;
@@ -84,25 +86,25 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
                }
        }else{
                if(da == 0.0){
                        BLASLONG stride_x = inc_x * sizeof(FLOAT);
                        BLASLONG ix = 0;
                        gvl = VSETVL(n);
                        BLASLONG stride_x = inc_x * sizeof(FLOAT);
                        BLASLONG ix = 0;
                        if(gvl <= n / 2){
                                long int inc_xv = gvl * inc_x;
                                v0 = VFMVVF_FLOAT(0, gvl);
                                for(i = 0, j = 0; i < n/(2*gvl); i++, j+=2*gvl){
                                        VSSEV_FLOAT(&x[ix], stride_x, v0, gvl);
                                        VSSEV_FLOAT(&x[ix + inc_xv], stride_x, v0, gvl);
                                        ix += inc_xv * 2;
                                }
                                v0 = VFMVVF_FLOAT(0, gvl);

                                for(i = 0; i < n/(gvl*2); ++i ){
                                        VSSEV_FLOAT(&x[ix], stride_x, v0, gvl);
                                        ix += inc_x * gvl;
                                        VSSEV_FLOAT(&x[ix], stride_x, v0, gvl);
                                        ix += inc_x * gvl;
                                }
                        //tail
                        for(; j <n; ){
                                gvl = VSETVL(n-j);

                        i *= gvl*2;
                        while( i < n ){
                                gvl = VSETVL(n-i);
                                v0 = VFMVVF_FLOAT(0, gvl);
                                VSSEV_FLOAT(&x[ix], stride_x, v0, gvl);
                                j += gvl;
                                ix += inc_x * gvl;
                                VSSEV_FLOAT(&x[ix], stride_x, v0, gvl);
                                i += gvl;
                                ix += inc_x * gvl;
                        }
                }else{
                        gvl = VSETVL(n);

File diff suppressed because it is too large
File diff suppressed because it is too large
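The JOIN/JOIN2 token-pasting helpers above are what let one source file target any ELEN/LMUL combination. A compile-time sketch of how an intrinsic name is assembled (assuming ELEN=32 and LMUL=m4; the `_` macro expands to nothing, so trailing JOIN slots can be left empty):

    #define JOIN2_X(x, y) x ## y
    #define JOIN2(x, y) JOIN2_X(x, y)
    #define JOIN(v, w, x, y, z) JOIN2(JOIN2(JOIN2(JOIN2(v, w), x), y), z)
    #define _          /* expands to nothing: fills unused JOIN slots */
    #define ELEN 32
    #define LMUL m4

    /* JOIN(vlse, ELEN, _v_f, ELEN, LMUL) pastes left to right:
         vlse -> vlse32 -> vlse32_v_f -> vlse32_v_f32 -> vlse32_v_f32m4
       JOIN(vsetvl, _e, ELEN, LMUL, _) ends on the empty `_` slot:
         vsetvl -> vsetvl_e -> vsetvl_e32 -> vsetvl_e32m4              */

The extra JOIN2 indirection matters: it forces ELEN, LMUL, and `_` to be macro-expanded before the ## paste happens, which is why each kernel can be reconfigured by editing only the ELEN/LMUL lines.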
@@ -0,0 +1,114 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"
#include <math.h>

#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFADDVV_FLOAT vfadd_vv_f32m8
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFADDVV_FLOAT vfadd_vv_f64m8
#endif
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
        BLASLONG i=0, j=0;
        BLASLONG ix=0;
        FLOAT asumf=0.0;
        if (n <= 0 || inc_x <= 0) return(asumf);
        unsigned int gvl = 0;
        FLOAT_V_T v0, v1, v_sum;
        FLOAT_V_T_M1 v_res;
        gvl = VSETVL_MAX;
        v_res = VFMVVF_FLOAT_M1(0, gvl);

        if(inc_x == 1){
                gvl = VSETVL(n);
                if(gvl <= n/2){
                        v_sum = VFMVVF_FLOAT(0, gvl);
                        for(i=0,j=0; i<n/(gvl*2); i++){
                                v0 = VLEV_FLOAT(&x[j], gvl);
                                v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);

                                v1 = VLEV_FLOAT(&x[j+gvl], gvl);
                                v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);
                                j += gvl * 2;
                        }
                        v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_res, gvl);
                }
                for(;j<n;){
                        gvl = VSETVL(n-j);
                        v0 = VLEV_FLOAT(&x[j], gvl);
                        v_res = VFREDSUMVS_FLOAT(v_res, v0, v_res, gvl);
                        j += gvl;
                }
        }else{
                gvl = VSETVL(n);
                unsigned int stride_x = inc_x * sizeof(FLOAT);
                if(gvl <= n/2){
                        v_sum = VFMVVF_FLOAT(0, gvl);
                        BLASLONG inc_xv = inc_x * gvl;
                        for(i=0,j=0; i<n/(gvl*2); i++){
                                v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
                                v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);

                                v1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl);
                                v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);
                                j += gvl * 2;
                                ix += inc_xv * 2;
                        }
                        v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_res, gvl);
                }
                for(;j<n;){
                        gvl = VSETVL(n-j);
                        v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
                        v_res = VFREDSUMVS_FLOAT(v_res, v0, v_res, gvl);
                        j += gvl;
                }
        }
        asumf = EXTRACT_FLOAT(v_res);
        return(asumf);
}
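For checking the new sum kernel a scalar reference is handy. This is a hedged test sketch, not part of the commit: the sum_ref helper, its double specialisation, and its signature are hypothetical.

    /* Scalar reference: plain sum of x[0], x[inc_x], ... (no absolute value,
       unlike asum). Hypothetical helper for unit-testing the kernel above. */
    static double sum_ref(long n, const double *x, long inc_x)
    {
        double s = 0.0;
        if (n <= 0 || inc_x <= 0) return s;   /* mirrors the kernel's guard */
        for (long i = 0, ix = 0; i < n; ++i, ix += inc_x)
            s += x[ix];
        return s;
    }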
@@ -27,35 +27,41 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "common.h"
#include <stdio.h>
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define VLEV_FLOAT vle_v_f32m8
#define VLSEV_FLOAT vlse_v_f32m8
#define VSEV_FLOAT vse_v_f32m8
#define VSSEV_FLOAT vsse_v_f32m8

#define LMUL m8
#if defined(DOUBLE)
# define ELEN 64
# define MLEN 8
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m8_t
#define VLEV_FLOAT vle_v_f64m8
#define VLSEV_FLOAT vlse_v_f64m8
#define VSEV_FLOAT vse_v_f64m8
#define VSSEV_FLOAT vsse_v_f64m8
# define ELEN 32
# define MLEN 4
#endif

#define _
#define JOIN2_X(x, y) x ## y
#define JOIN2(x, y) JOIN2_X(x, y)
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)

#define VSETVL JOIN(vsetvl, _e, ELEN, LMUL, _)
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
#define VLEV_FLOAT JOIN(vle, ELEN, _v_f, ELEN, LMUL)
#define VLSEV_FLOAT JOIN(vlse, ELEN, _v_f, ELEN, LMUL)
#define VSEV_FLOAT JOIN(vse, ELEN, _v_f, ELEN, LMUL)
#define VSSEV_FLOAT JOIN(vsse, ELEN, _v_f, ELEN, LMUL)

int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
        BLASLONG i = 0, j = 0;
        BLASLONG ix = 0,iy = 0;
        BLASLONG stride_x, stride_y;
        FLOAT_V_T vx0, vx1, vy0, vy1;
        unsigned int gvl = 0;

        if (n < 0) return(0);

        unsigned int gvl = VSETVL((inc_x != 0 && inc_y != 0) ? n : 1);
        if( inc_x == 0 && inc_y == 0 ) { n = n & 1; }

        if(inc_x == 1 && inc_y == 1){
                gvl = VSETVL(n);
                if(gvl <= n/2){
                        for(i=0,j=0; i<n/(2*gvl); i++){
                                vx0 = VLEV_FLOAT(&x[j], gvl);
@@ -79,7 +85,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
                        j+=gvl;
                }
        }else if (inc_y == 1){
                gvl = VSETVL(n);
                stride_x = inc_x * sizeof(FLOAT);
                if(gvl <= n/2){
                        BLASLONG inc_xv = inc_x * gvl;
@@ -107,7 +112,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
                        ix += inc_x * gvl;
                }
        }else if(inc_x == 1){
                gvl = VSETVL(n);
                stride_y = inc_y * sizeof(FLOAT);
                if(gvl <= n/2){
                        BLASLONG inc_yv = inc_y * gvl;
@@ -135,7 +139,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
                        iy += inc_y * gvl;
                }
        }else{
                gvl = VSETVL(n);
                stride_x = inc_x * sizeof(FLOAT);
                stride_y = inc_y * sizeof(FLOAT);
                if(gvl <= n/2){
@@ -31,11 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m4_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
#define VLEV_FLOAT vle_v_f32m4
#define VLSEV_FLOAT vlse_v_f32m4
#define VSEV_FLOAT vse_v_f32m4
#define VSSEV_FLOAT vsse_v_f32m4
#define VLEV_FLOAT vle32_v_f32m4
#define VLSEV_FLOAT vlse32_v_f32m4
#define VSEV_FLOAT vse32_v_f32m4
#define VSSEV_FLOAT vsse32_v_f32m4
#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
#define VFMACCVV_FLOAT vfmacc_vv_f32m4
#define VFMACCVF_FLOAT vfmacc_vf_f32m4
@@ -47,11 +46,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m4_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
#define VLEV_FLOAT vle_v_f64m4
#define VLSEV_FLOAT vlse_v_f64m4
#define VSEV_FLOAT vse_v_f64m4
#define VSSEV_FLOAT vsse_v_f64m4
#define VLEV_FLOAT vle64_v_f64m4
#define VLSEV_FLOAT vlse64_v_f64m4
#define VSEV_FLOAT vse64_v_f64m4
#define VSSEV_FLOAT vsse64_v_f64m4
#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
#define VFMACCVV_FLOAT vfmacc_vv_f64m4
#define VFMACCVF_FLOAT vfmacc_vf_f64m4
@@ -100,7 +98,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                        i += gvl;
                }
                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
                temp2 = VFMVFS_FLOAT(v_res);
                temp2 = EXTRACT_FLOAT(v_res);
                if(i < m){
                        gvl = VSETVL(m-i);
                        vy = VLEV_FLOAT(&y[i], gvl);
@@ -111,7 +109,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                        vx = VLEV_FLOAT(&x[i], gvl);
                        vr = VFMULVV_FLOAT(vx, va, gvl);
                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
                        temp2 += VFMVFS_FLOAT(v_res);
                        temp2 += EXTRACT_FLOAT(v_res);
                }
        }
        y[j] += alpha * temp2;
@@ -145,7 +143,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                        iy += inc_yv;
                }
                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
                temp2 = VFMVFS_FLOAT(v_res);
                temp2 = EXTRACT_FLOAT(v_res);
                if(i < m){
                        gvl = VSETVL(m-i);
                        vy = VLSEV_FLOAT(&y[iy], stride_y, gvl);
@@ -156,7 +154,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                        vx = VLEV_FLOAT(&x[i], gvl);
                        vr = VFMULVV_FLOAT(vx, va, gvl);
                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
                        temp2 += VFMVFS_FLOAT(v_res);
                        temp2 += EXTRACT_FLOAT(v_res);
                }
        }
        y[jy] += alpha * temp2;
@@ -191,7 +189,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                        ix += inc_xv;
                }
                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
                temp2 = VFMVFS_FLOAT(v_res);
                temp2 = EXTRACT_FLOAT(v_res);
                if(i < m){
                        gvl = VSETVL(m-i);
                        vy = VLEV_FLOAT(&y[i], gvl);
@@ -202,7 +200,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                        vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
                        vr = VFMULVV_FLOAT(vx, va, gvl);
                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
                        temp2 += VFMVFS_FLOAT(v_res);
                        temp2 += EXTRACT_FLOAT(v_res);
                }
        }
        y[j] += alpha * temp2;
@@ -242,7 +240,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                        iy += inc_yv;
                }
                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
                temp2 = VFMVFS_FLOAT(v_res);
                temp2 = EXTRACT_FLOAT(v_res);
                if(i < m){
                        gvl = VSETVL(m-i);
                        vy = VLSEV_FLOAT(&y[iy], stride_y, gvl);
@@ -253,7 +251,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                        vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
                        vr = VFMULVV_FLOAT(vx, va, gvl);
                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
                        temp2 += VFMVFS_FLOAT(v_res);
                        temp2 += EXTRACT_FLOAT(v_res);
                }
        }
        y[jy] += alpha * temp2;
@@ -31,11 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m4_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
#define VLEV_FLOAT vle_v_f32m4
#define VLSEV_FLOAT vlse_v_f32m4
#define VSEV_FLOAT vse_v_f32m4
#define VSSEV_FLOAT vsse_v_f32m4
#define VLEV_FLOAT vle32_v_f32m4
#define VLSEV_FLOAT vlse32_v_f32m4
#define VSEV_FLOAT vse32_v_f32m4
#define VSSEV_FLOAT vsse32_v_f32m4
#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
#define VFMACCVV_FLOAT vfmacc_vv_f32m4
#define VFMACCVF_FLOAT vfmacc_vf_f32m4
@@ -48,11 +47,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m4_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
#define VLEV_FLOAT vle_v_f64m4
#define VLSEV_FLOAT vlse_v_f64m4
#define VSEV_FLOAT vse_v_f64m4
#define VSSEV_FLOAT vsse_v_f64m4
#define VLEV_FLOAT vle64_v_f64m4
#define VLSEV_FLOAT vlse64_v_f64m4
#define VSEV_FLOAT vse64_v_f64m4
#define VSSEV_FLOAT vsse64_v_f64m4
#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
#define VFMACCVV_FLOAT vfmacc_vv_f64m4
#define VFMACCVF_FLOAT vfmacc_vf_f64m4
@@ -102,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                        i += gvl;
                }
                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
                temp2 = VFMVFS_FLOAT(v_res);
                temp2 = EXTRACT_FLOAT(v_res);
                if(i < j){
                        gvl = VSETVL(j-i);
                        vy = VLEV_FLOAT(&y[i], gvl);
@@ -113,7 +111,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                        vx = VLEV_FLOAT(&x[i], gvl);
                        vr = VFMULVV_FLOAT(vx, va, gvl);
                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
                        temp2 += VFMVFS_FLOAT(v_res);
                        temp2 += EXTRACT_FLOAT(v_res);
                }
        }
        y[j] += temp1 * a_ptr[j] + alpha * temp2;
@@ -146,7 +144,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                        iy += inc_yv;
                }
                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
                temp2 = VFMVFS_FLOAT(v_res);
                temp2 = EXTRACT_FLOAT(v_res);
                if(i < j){
                        gvl = VSETVL(j-i);
                        vy = VLSEV_FLOAT(&y[iy], stride_y, gvl);
@@ -157,7 +155,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                        vx = VLEV_FLOAT(&x[i], gvl);
                        vr = VFMULVV_FLOAT(vx, va, gvl);
                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
                        temp2 += VFMVFS_FLOAT(v_res);
                        temp2 += EXTRACT_FLOAT(v_res);
                }
        }
        y[jy] += temp1 * a_ptr[j] + alpha * temp2;
@@ -191,7 +189,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                        ix += inc_xv;
                }
                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
                temp2 = VFMVFS_FLOAT(v_res);
                temp2 = EXTRACT_FLOAT(v_res);
                if(i < j){
                        gvl = VSETVL(j-i);
                        vy = VLEV_FLOAT(&y[i], gvl);
@@ -202,7 +200,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                        vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
                        vr = VFMULVV_FLOAT(vx, va, gvl);
                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
                        temp2 += VFMVFS_FLOAT(v_res);
                        temp2 += EXTRACT_FLOAT(v_res);
                }
        }
        y[j] += temp1 * a_ptr[j] + alpha * temp2;
@@ -241,7 +239,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                        iy += inc_yv;
                }
                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
                temp2 = VFMVFS_FLOAT(v_res);
                temp2 = EXTRACT_FLOAT(v_res);
                if(i < j){
                        gvl = VSETVL(j-i);
                        vy = VLSEV_FLOAT(&y[iy], stride_y, gvl);
@@ -252,7 +250,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                        vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
                        vr = VFMULVV_FLOAT(vx, va, gvl);
                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
                        temp2 += VFMVFS_FLOAT(v_res);
                        temp2 += EXTRACT_FLOAT(v_res);
                }
        }
        y[jy] += temp1 * a_ptr[j] + alpha * temp2;
@@ -28,40 +28,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include <math.h>

#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
#define VLSEV_FLOAT vlse_v_f32m8
#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
#define MASK_T vbool4_t
#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
#define VFMAXVV_FLOAT vfmax_vv_f32m8
#define VFADDVV_FLOAT vfadd_vv_f32m8

#define LMUL m8
#if defined(DOUBLE)
# define ELEN 64
# define MLEN 8
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
#define VLSEV_FLOAT vlse_v_f64m8
#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
#define MASK_T vbool8_t
#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
#define VFMAXVV_FLOAT vfmax_vv_f64m8
#define VFADDVV_FLOAT vfadd_vv_f64m8

# define ELEN 32
# define MLEN 4
#endif

#define _
#define JOIN2_X(x, y) x ## y
#define JOIN2(x, y) JOIN2_X(x, y)
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)

#define VSETVL JOIN(vsetvl, _e, ELEN, LMUL, _)
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _)
#define VLEV_FLOAT JOIN(vle, ELEN, _v_f, ELEN, LMUL)
#define VLSEV_FLOAT JOIN(vlse, ELEN, _v_f, ELEN, LMUL)
#define VFREDMAXVS_FLOAT JOIN(vfredmax_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1))
#define MASK_T JOIN(vbool, MLEN, _t, _, _)
#define VMFLTVF_FLOAT JOIN(vmflt_vf_f, ELEN, LMUL, _b, MLEN)
#define VFMVVF_FLOAT JOIN(vfmv, _v_f_f, ELEN, LMUL, _)
#define VFMVVF_FLOAT_M1 JOIN(vfmv, _v_f_f, ELEN, m1, _)
#define VFRSUBVF_MASK_FLOAT JOIN(vfrsub,_vf_f, ELEN, LMUL, _m)
#define VFMAXVV_FLOAT JOIN(vfmax, _vv_f, ELEN, LMUL, _)
#define VFADDVV_FLOAT JOIN(vfadd, _vv_f, ELEN, LMUL, _)

FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
        BLASLONG i=0, j=0;
@@ -70,10 +64,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
        if (n <= 0 || inc_x <= 0) return(maxf);
        unsigned int gvl = 0;
        FLOAT_V_T v0, v1, v_max;
        FLOAT_V_T_M1 v_res, v_z0;
        gvl = VSETVL_MAX;
        v_res = VFMVVF_FLOAT_M1(0, gvl);
        v_z0 = VFMVVF_FLOAT_M1(0, gvl);
        FLOAT_V_T_M1 v_res;
        v_res = VFMVVF_FLOAT_M1(0, 1);

        MASK_T mask0, mask1;
        BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
@@ -94,8 +86,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                        j += gvl;
                        ix += inc_xv;
                }
                v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl);
                maxf = VFMVFS_FLOAT(v_res);
                v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_res, gvl);

                if(j<n){
                        gvl = VSETVL(n-j);
@@ -106,10 +97,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                        mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
                        v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
                        v1 = VFADDVV_FLOAT(v0, v1, gvl);
                        v_res = VFREDMAXVS_FLOAT(v_res, v1, v_z0, gvl);

                        if(VFMVFS_FLOAT(v_res) > maxf)
                                maxf = VFMVFS_FLOAT(v_res);
                        v_res = VFREDMAXVS_FLOAT(v_res, v1, v_res, gvl);
                }
        maxf = EXTRACT_FLOAT(v_res);
        return(maxf);
}
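As the paired loads of x[ix] and x[ix+1] followed by VFADDVV_FLOAT suggest, this complex amax-style kernel ranks elements by |Re| + |Im| (the BLAS cabs1 convention), not by the Euclidean modulus. A hedged scalar sketch of the same metric; the helper names (cabs1, zamax_ref) and the double specialisation are hypothetical:

    #include <math.h>

    /* cabs1-style magnitude used by the kernel: |Re| + |Im|. */
    static double cabs1(double re, double im) { return fabs(re) + fabs(im); }

    /* Hypothetical scalar reference for the complex amax kernel above. */
    static double zamax_ref(long n, const double *x, long inc_x)
    {
        double maxf = 0.0;
        if (n <= 0 || inc_x <= 0) return maxf;
        for (long i = 0, ix = 0; i < n; ++i, ix += 2 * inc_x) {
            double m = cabs1(x[ix], x[ix + 1]);
            if (m > maxf) maxf = m;
        }
        return maxf;
    }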
@@ -29,38 +29,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <math.h>
#include <float.h>

#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
#define VLSEV_FLOAT vlse_v_f32m8
#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
#define MASK_T vbool4_t
#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
#define VFMINVV_FLOAT vfmin_vv_f32m8
#define VFADDVV_FLOAT vfadd_vv_f32m8

#define LMUL m4
#if defined(DOUBLE)
# define ELEN 64
# define MLEN 8
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
#define VLSEV_FLOAT vlse_v_f64m8
#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
#define MASK_T vbool8_t
#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
#define VFMINVV_FLOAT vfmin_vv_f64m8
#define VFADDVV_FLOAT vfadd_vv_f64m8
# define ELEN 32
# define MLEN 4
#endif

#define _
#define JOIN2_X(x, y) x ## y
#define JOIN2(x, y) JOIN2_X(x, y)
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)

#define VSETVL JOIN(vsetvl, _e, ELEN, LMUL, _)
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _)
#define VLEV_FLOAT JOIN(vle, ELEN, _v_f, ELEN, LMUL)
#define VLSEV_FLOAT JOIN(vlse, ELEN, _v_f, ELEN, LMUL)
#define VFREDMINVS_FLOAT JOIN(vfredmin_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1))
#define MASK_T JOIN(vbool, MLEN, _t, _, _)
#define VMFLTVF_FLOAT JOIN(vmflt_vf_f, ELEN, LMUL, _b, MLEN)
#define VFMVVF_FLOAT JOIN(vfmv, _v_f_f, ELEN, LMUL, _)
#define VFMVVF_FLOAT_M1 JOIN(vfmv, _v_f_f, ELEN, m1, _)
#define VFRSUBVF_MASK_FLOAT JOIN(vfrsub,_vf_f, ELEN, LMUL, _m)
#define VFMINVV_FLOAT JOIN(vfmin, _vv_f, ELEN, LMUL, _)
#define VFADDVV_FLOAT JOIN(vfadd, _vv_f, ELEN, LMUL, _)

FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
        BLASLONG i=0, j=0;
@@ -69,10 +66,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
        FLOAT minf=FLT_MAX;
        unsigned int gvl = 0;
        FLOAT_V_T v0, v1, v_min;
        FLOAT_V_T_M1 v_res, v_max;
        gvl = VSETVL_MAX;
        v_res = VFMVVF_FLOAT_M1(0, gvl);
        v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl);
        FLOAT_V_T_M1 v_res;
        v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1);

        MASK_T mask0, mask1;
        BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
@@ -93,8 +88,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                        j += gvl;
                        ix += inc_xv;
                }
                v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
                minf = VFMVFS_FLOAT(v_res);
                v_res = VFREDMINVS_FLOAT(v_res, v_min, v_res, gvl);

                if(j<n){
                        gvl = VSETVL(n-j);
@@ -105,9 +99,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                        mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
                        v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
                        v1 = VFADDVV_FLOAT(v0, v1, gvl);
                        v_res = VFREDMINVS_FLOAT(v_res, v1, v_max, gvl);
                        if(VFMVFS_FLOAT(v_res) < minf)
                                minf = VFMVFS_FLOAT(v_res);
                        v_res = VFREDMINVS_FLOAT(v_res, v1, v_res, gvl);
                }

        minf = EXTRACT_FLOAT(v_res);
        return(minf);
}
@@ -28,37 +28,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include <math.h>

#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VFFMVFS_FLOAT vfmv_f_s_f32m1_f32
#define VLEV_FLOAT vle_v_f32m8
#define VLSEV_FLOAT vlse_v_f32m8
#define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1
#define MASK_T vbool4_t
#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
#define VFADDVV_FLOAT vfadd_vv_f32m8
#define LMUL m4
#if defined(DOUBLE)
# define ELEN 64
# define MLEN _b8
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VFFMVFS_FLOAT vfmv_f_s_f64m1_f64
#define VLEV_FLOAT vle_v_f64m8
#define VLSEV_FLOAT vlse_v_f64m8
#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1
#define MASK_T vbool8_t
#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
#define VFADDVV_FLOAT vfadd_vv_f64m8
# define ELEN 32
# define MLEN _b4
#endif

#define _
#define JOIN2_X(x, y) x ## y
#define JOIN2(x, y) JOIN2_X(x, y)
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)

#define VSETVL JOIN(vsetvl, _e, ELEN, LMUL, _)
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _)
#define VLEV_FLOAT JOIN(vle, ELEN, _v_f, ELEN, LMUL)
#define VLSEV_FLOAT JOIN(vlse, ELEN, _v_f, ELEN, LMUL)
#define VFREDSUMVS_FLOAT JOIN(vfredusum_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1))
#define VFABS_FLOAT JOIN(vfabs, _v_f, ELEN, LMUL, _)
#define VFMVVF_FLOAT JOIN(vfmv, _v_f_f, ELEN, LMUL, _)
#define VFMVVF_FLOAT_M1 JOIN(vfmv, _v_f_f, ELEN, m1, _)
#define VFADDVV_FLOAT JOIN(vfadd, _vv_f, ELEN, LMUL, _)
#define VMFLTVF_FLOAT JOIN(vmflt, _vf_f, ELEN, LMUL, MLEN)

FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
        BLASLONG i=0, j=0;
@@ -67,12 +62,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
        if (n <= 0 || inc_x <= 0) return(asumf);
        unsigned int gvl = 0;
        FLOAT_V_T v0, v1, v_zero,v_sum;
        FLOAT_V_T_M1 v_res, v_z0;
        gvl = VSETVL_MAX;
        v_res = VFMVVF_FLOAT_M1(0, gvl);
        v_z0 = VFMVVF_FLOAT_M1(0, gvl);
        FLOAT_V_T_M1 v_res;
        v_res = VFMVVF_FLOAT_M1(0, 1);

        MASK_T mask0, mask1;
        if(inc_x == 1){
                BLASLONG n2 = n * 2;
                gvl = VSETVL(n2);
@@ -81,26 +73,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                        v_sum = VFMVVF_FLOAT(0, gvl);
                        for(i=0,j=0; i<n2/(gvl*2); i++){
                                v0 = VLEV_FLOAT(&x[j], gvl);
                                mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
                                v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
                                v0 = VFABS_FLOAT(v0, gvl);
                                v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);

                                v1 = VLEV_FLOAT(&x[j+gvl], gvl);
                                mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
                                v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
                                v1 = VFABS_FLOAT(v1, gvl);
                                v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);
                                j += gvl * 2;
                        }
                        v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl);
                        asumf += VFFMVFS_FLOAT(v_res);
                        v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_res, gvl);
                }
                for(;j<n2;){
                        gvl = VSETVL(n2-j);
                        v0 = VLEV_FLOAT(&x[j], gvl);
                        mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
                        v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
                        v_res = VFREDSUMVS_FLOAT(v_res, v0, v_z0, gvl);
                        asumf += VFFMVFS_FLOAT(v_res);
                        v0 = VFABS_FLOAT(v0, gvl);
                        v_res = VFREDSUMVS_FLOAT(v_res, v0, v_res, gvl);
                        j += gvl;
                }
        }else{
@@ -112,34 +99,29 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                        v_sum = VFMVVF_FLOAT(0, gvl);
                        for(i=0,j=0; i<n/gvl; i++){
                                v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
                                mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
                                v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
                                v0 = VFABS_FLOAT(v0, gvl);
                                v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);

                                v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
                                mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
                                v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
                                v1 = VFABS_FLOAT(v1, gvl);
                                v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);

                                j += gvl;
                                ix += inc_xv;
                        }
                        v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl);
                        asumf += VFFMVFS_FLOAT(v_res);
                        v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_res, gvl);
                        if(j<n){
                                gvl = VSETVL(n-j);
                                v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
                                mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
                                v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
                                v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
                                v0 = VFABS_FLOAT(v0, gvl);

                                mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
                                v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
                                v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
                                v1 = VFABS_FLOAT(v1, gvl);
                                v_sum = VFADDVV_FLOAT(v0, v1, gvl);
                                v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl);
                                asumf += VFFMVFS_FLOAT(v_res);
                                v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_res, gvl);
                        }
                }
        asumf = EXTRACT_FLOAT(v_res);
        return(asumf);
}
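The same |Re| + |Im| convention applies to complex asum. A hedged scalar reference for the kernel above; the zasum_ref name, signature, and double specialisation are hypothetical, introduced only for illustration:

    #include <math.h>

    /* Hypothetical scalar reference: sum of |Re| + |Im| over n complex
       elements stored as interleaved (re, im) pairs with stride inc_x. */
    static double zasum_ref(long n, const double *x, long inc_x)
    {
        double s = 0.0;
        if (n <= 0 || inc_x <= 0) return s;
        for (long i = 0, ix = 0; i < n; ++i, ix += 2 * inc_x)
            s += fabs(x[ix]) + fabs(x[ix + 1]);
        return s;
    }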
@@ -30,8 +30,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define FLOAT_V_T vfloat32m4_t
#define VLSEV_FLOAT vlse_v_f32m4
#define VSSEV_FLOAT vsse_v_f32m4
#define VLSEV_FLOAT vlse32_v_f32m4
#define VSSEV_FLOAT vsse32_v_f32m4
#define VFMACCVF_FLOAT vfmacc_vf_f32m4
#define VFMVVF_FLOAT vfmv_v_f_f32m4
#define VFMULVF_FLOAT vfmul_vf_f32m4
@@ -40,8 +40,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define FLOAT_V_T vfloat64m4_t
#define VLSEV_FLOAT vlse_v_f64m4
#define VSSEV_FLOAT vsse_v_f64m4
#define VLSEV_FLOAT vlse64_v_f64m4
#define VSSEV_FLOAT vsse64_v_f64m4
#define VFMACCVF_FLOAT vfmacc_vf_f64m4
#define VFMVVF_FLOAT vfmv_v_f_f64m4
#define VFMULVF_FLOAT vfmul_vf_f64m4
@@ -30,15 +30,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define FLOAT_V_T vfloat32m4_t
#define VLSEV_FLOAT vlse_v_f32m4
#define VSSEV_FLOAT vsse_v_f32m4
#define VLSEV_FLOAT vlse32_v_f32m4
#define VSSEV_FLOAT vsse32_v_f32m4
#define VFMACCVF_FLOAT vfmacc_vf_f32m4
#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define FLOAT_V_T vfloat64m4_t
#define VLSEV_FLOAT vlse_v_f64m4
#define VSSEV_FLOAT vsse_v_f64m4
#define VLSEV_FLOAT vlse64_v_f64m4
#define VSSEV_FLOAT vsse64_v_f64m4
#define VFMACCVF_FLOAT vfmacc_vf_f64m4
#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
#endif
@@ -29,13 +29,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define FLOAT_V_T vfloat32m4_t
#define VLSEV_FLOAT vlse_v_f32m4
#define VSSEV_FLOAT vsse_v_f32m4
#define VLSEV_FLOAT vlse32_v_f32m4
#define VSSEV_FLOAT vsse32_v_f32m4
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define FLOAT_V_T vfloat64m4_t
#define VLSEV_FLOAT vlse_v_f64m4
#define VSSEV_FLOAT vsse_v_f64m4
#define VLSEV_FLOAT vlse64_v_f64m4
#define VSSEV_FLOAT vsse64_v_f64m4
#endif
@@ -32,8 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FLOAT_V_T vfloat32m4_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
#define VLEV_FLOAT vle_v_f32m4
#define VLSEV_FLOAT vlse_v_f32m4
#define VLEV_FLOAT vle32_v_f32m4
#define VLSEV_FLOAT vlse32_v_f32m4
#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
#define VFMACCVV_FLOAT vfmacc_vv_f32m4
#define VFMVVF_FLOAT vfmv_v_f_f32m4
@@ -48,8 +48,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FLOAT_V_T vfloat64m4_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
#define VLEV_FLOAT vle_v_f64m4
#define VLSEV_FLOAT vlse_v_f64m4
#define VLEV_FLOAT vle64_v_f64m4
#define VLSEV_FLOAT vlse64_v_f64m4
#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
#define VFMACCVV_FLOAT vfmacc_vv_f64m4
#define VFMVVF_FLOAT vfmv_v_f_f64m4

File diff suppressed because it is too large
@@ -29,19 +29,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define FLOAT_V_T vfloat32m4_t
#define VLEV_FLOAT vle_v_f32m4
#define VLSEV_FLOAT vlse_v_f32m4
#define VSEV_FLOAT vse_v_f32m4
#define VSSEV_FLOAT vsse_v_f32m4
#define VLEV_FLOAT vle32_v_f32m4
#define VLSEV_FLOAT vlse32_v_f32m4
#define VSEV_FLOAT vse32_v_f32m4
#define VSSEV_FLOAT vsse32_v_f32m4
#define VFMACCVF_FLOAT vfmacc_vf_f32m4
#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define FLOAT_V_T vfloat64m4_t
#define VLEV_FLOAT vle_v_f64m4
#define VLSEV_FLOAT vlse_v_f64m4
#define VSEV_FLOAT vse_v_f64m4
#define VSSEV_FLOAT vsse_v_f64m4
#define VLEV_FLOAT vle64_v_f64m4
#define VLSEV_FLOAT vlse64_v_f64m4
#define VSEV_FLOAT vse64_v_f64m4
#define VSSEV_FLOAT vsse64_v_f64m4
#define VFMACCVF_FLOAT vfmacc_vf_f64m4
#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
#endif
@@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FLOAT_V_T vfloat32m4_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
#define VLSEV_FLOAT vlse_v_f32m4
#define VLSEV_FLOAT vlse32_v_f32m4
#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
#define VFMACCVV_FLOAT vfmacc_vv_f32m4
#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4
@@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FLOAT_V_T vfloat64m4_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
#define VLSEV_FLOAT vlse_v_f64m4
#define VLSEV_FLOAT vlse64_v_f64m4
#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
#define VFMACCVV_FLOAT vfmacc_vv_f64m4
#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4
@@ -32,8 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FLOAT_V_T vfloat32m4_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
#define VLSEV_FLOAT vlse_v_f32m4
#define VSSEV_FLOAT vsse_v_f32m4
#define VLSEV_FLOAT vlse32_v_f32m4
#define VSSEV_FLOAT vsse32_v_f32m4
#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
#define VFMACCVV_FLOAT vfmacc_vv_f32m4
#define VFMACCVF_FLOAT vfmacc_vf_f32m4
@@ -48,8 +48,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FLOAT_V_T vfloat64m4_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
#define VLSEV_FLOAT vlse_v_f64m4
#define VSSEV_FLOAT vsse_v_f64m4
#define VLSEV_FLOAT vlse64_v_f64m4
#define VSSEV_FLOAT vsse64_v_f64m4
#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
#define VFMACCVV_FLOAT vfmacc_vv_f64m4
#define VFMACCVF_FLOAT vfmacc_vf_f64m4
@@ -32,8 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FLOAT_V_T vfloat32m4_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
#define VLSEV_FLOAT vlse_v_f32m4
#define VSSEV_FLOAT vsse_v_f32m4
#define VLSEV_FLOAT vlse32_v_f32m4
#define VSSEV_FLOAT vsse32_v_f32m4
#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
#define VFMACCVV_FLOAT vfmacc_vv_f32m4
#define VFMACCVF_FLOAT vfmacc_vf_f32m4
@@ -48,8 +48,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FLOAT_V_T vfloat64m4_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
#define VLSEV_FLOAT vlse_v_f64m4
#define VSSEV_FLOAT vsse_v_f64m4
#define VLSEV_FLOAT vlse64_v_f64m4
#define VSSEV_FLOAT vsse64_v_f64m4
#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
#define VFMACCVV_FLOAT vfmacc_vv_f64m4
#define VFMACCVF_FLOAT vfmacc_vf_f64m4
@ -26,264 +26,140 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m4(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m1()
|
||||
#define FLOAT_V_T vfloat32m4_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
|
||||
#define VLEV_FLOAT vle_v_f32m4
|
||||
#define VLSEV_FLOAT vlse_v_f32m4
|
||||
#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
|
||||
#define VFMACCVV_FLOAT vfmacc_vv_f32m4
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f32m4
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
|
||||
#define VFDOTVV_FLOAT vfdot_vv_f32m4
|
||||
#define ABS fabsf
|
||||
#define MASK_T vbool8_t
|
||||
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m4_m
|
||||
#define VMFGTVF_FLOAT vmfgt_vf_f32m4_b8
|
||||
#define VMFIRSTM vmfirst_m_b8
|
||||
#define VFDIVVF_FLOAT vfdiv_vf_f32m4
|
||||
#define VMFLTVF_FLOAT vmflt_vf_f32m4_b8
|
||||
#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1
|
||||
|
||||
#define LMUL m1
|
||||
#if defined(DOUBLE)
|
||||
# define ELEN 64
|
||||
# define MLEN 64
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m4(n)
|
||||
#define VSETVL_MAX vsetvlmax_e64m1()
|
||||
#define FLOAT_V_T vfloat64m4_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
|
||||
#define VLEV_FLOAT vle_v_f64m4
|
||||
#define VLSEV_FLOAT vlse_v_f64m4
|
||||
#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
|
||||
#define VFMACCVV_FLOAT vfmacc_vv_f64m4
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f64m4
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
|
||||
#define VFDOTVV_FLOAT vfdot_vv_f64m4
|
||||
#define ABS fabs
|
||||
#define MASK_T vbool16_t
|
||||
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m4_m
|
||||
#define VMFGTVF_FLOAT vmfgt_vf_f64m4_b16
|
||||
#define VMFIRSTM vmfirst_m_b16
|
||||
#define VFDIVVF_FLOAT vfdiv_vf_f64m4
|
||||
#define VMFLTVF_FLOAT vmflt_vf_f64m4_b16
|
||||
#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1
|
||||
# define ELEN 32
|
||||
# define MLEN 32
|
||||
#endif
|
||||
|
||||
#define _
|
||||
#define JOIN2_X(x, y) x ## y
|
||||
#define JOIN2(x, y) JOIN2_X(x, y)
|
||||
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
|
||||
|
||||
#define VSETVL JOIN(vsetvl, _e, ELEN, LMUL, _)
|
||||
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
|
||||
#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _)
|
||||
#define VLEV_FLOAT JOIN(vle, ELEN, _v_f, ELEN, LMUL)
|
||||
#define VLSEV_FLOAT JOIN(vlse, ELEN, _v_f, ELEN, LMUL)
|
||||
#define VFMVVF_FLOAT JOIN(vfmv, _v_f_f, ELEN, LMUL, _)
|
||||
#define VFMVVF_FLOAT_M1 JOIN(vfmv, _v_f_f, ELEN, m1, _)
|
||||
#define MASK_T JOIN(vbool, MLEN, _t, _, _)
|
||||
#define VFABS JOIN(vfabs, _v_f, ELEN, LMUL, _)
|
||||
#define VMFNE JOIN(vmfne_vf_f,ELEN, LMUL, _b, MLEN)
|
||||
#define VMFGT JOIN(vmfgt_vv_f,ELEN, LMUL, _b, MLEN)
|
||||
#define VMFEQ JOIN(vmfeq_vv_f,ELEN, LMUL, _b, MLEN)
|
||||
#define VCPOP JOIN(vcpop, _m_b, MLEN, _, _)
|
||||
#define VFREDMAX JOIN(vfredmax_vs_f,ELEN,LMUL, JOIN2(_f, ELEN), m1)
|
||||
#define VFIRST JOIN(vfirst, _m_b, MLEN, _, _)
|
||||
#define VRGATHER JOIN(vrgather, _vx_f, ELEN, LMUL, _)
|
||||
#define VFDIV JOIN(vfdiv, _vf_f, ELEN, LMUL, _)
|
||||
#define VFDIV_M JOIN(vfdiv, _vv_f, ELEN, LMUL, _m)
|
||||
#define VFMUL JOIN(vfmul, _vv_f, ELEN, LMUL, _)
|
||||
#define VFMACC JOIN(vfmacc, _vv_f, ELEN, LMUL, _)
|
||||
#define VFMACC_M JOIN(vfmacc, _vv_f, ELEN, LMUL, _m)
|
||||
#define VMSOF JOIN(vmsof, _m_b, MLEN, _, _)
|
||||
#define VMANDN JOIN(vmandn, _mm_b, MLEN, _, _)
|
||||
#define VFREDUSUM JOIN(vfredusum_vs_f,ELEN,LMUL, JOIN2(_f, ELEN), m1)
|
||||
#if defined(DOUBLE)
|
||||
#define ABS fabs
|
||||
#else
|
||||
#define ABS fabsf
|
||||
#endif
|
||||
|
||||
#define EXTRACT_FLOAT0_V(v) JOIN(vfmv_f_s_f, ELEN, LMUL, _f, ELEN)(v)
|
||||
|
||||
|
||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
BLASLONG i=0, j=0;
|
||||
BLASLONG i=0;
|
||||
|
||||
if ( n < 0 ) return(0.0);
|
||||
// if(n == 1) return (ABS(x[0]));
|
||||
if(n < 0) return(0.0);
|
||||
|
||||
FLOAT_V_T vr, v0, v_zero;
|
||||
FLOAT_V_T v_ssq, v_scale, v0, v1, v_zero;
|
||||
unsigned int gvl = 0;
|
||||
FLOAT_V_T_M1 v_res, v_z0;
|
||||
gvl = VSETVL_MAX;
|
||||
v_res = VFMVVF_FLOAT_M1(0, gvl);
|
||||
v_z0 = VFMVVF_FLOAT_M1(0, gvl);
|
||||
|
||||
FLOAT scale = 0.0, ssq = 0.0;
|
||||
MASK_T mask;
|
||||
BLASLONG index = 0;
|
||||
if(inc_x == 1){
|
||||
BLASLONG n2 = n * 2;
|
||||
gvl = VSETVL(n2);
|
||||
vr = VFMVVF_FLOAT(0, gvl);
|
||||
v_zero = VFMVVF_FLOAT(0, gvl);
|
||||
for(i=0,j=0; i<n2/gvl; i++){
|
||||
v0 = VLEV_FLOAT(&x[j], gvl);
|
||||
//fabs(vector)
|
||||
mask = VMFLTVF_FLOAT(v0, 0, gvl);
|
||||
v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
|
||||
//if scale change
|
||||
mask = VMFGTVF_FLOAT(v0, scale, gvl);
|
||||
index = VMFIRSTM(mask, gvl);
|
||||
if(index == -1){//no elements greater than scale
|
||||
if(scale != 0.0){
|
||||
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
|
||||
vr = VFMACCVV_FLOAT(vr, v0, v0, gvl);
|
||||
}
|
||||
}else{//found greater element
|
||||
//ssq in vector vr: vr[0]
|
||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
|
||||
//total ssq before current vector
|
||||
ssq += VFMVFS_FLOAT(v_res);
|
||||
//find max
|
||||
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
|
||||
//update ssq before max_index
|
||||
ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
|
||||
//update scale
|
||||
scale = VFMVFS_FLOAT(v_res);
|
||||
//ssq in vector vr
|
||||
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
|
||||
vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
|
||||
}
|
||||
j += gvl;
|
||||
v_res = VFMVVF_FLOAT_M1(0, 1);
|
||||
v_z0 = VFMVVF_FLOAT_M1(0, 1);
|
||||
|
||||
gvl = VSETVL(n);
|
||||
v_ssq = VFMVVF_FLOAT(0, gvl);
|
||||
v_scale = VFMVVF_FLOAT(0, gvl);
|
||||
v_zero = VFMVVF_FLOAT(0, gvl);
|
||||
|
||||
unsigned int stride_x = inc_x * sizeof(FLOAT) * 2;
|
||||
int idx = 0;
|
||||
|
||||
for(i=0; i<n/gvl; i++){
|
||||
v0 = VLSEV_FLOAT( &x[idx], stride_x, gvl );
|
||||
v1 = VLSEV_FLOAT( &x[idx+1], stride_x, gvl );
|
||||
v0 = VFABS( v0, gvl );
|
||||
v1 = VFABS( v1, gvl );
|
||||
|
||||
MASK_T scale_mask0 = VMFGT( v0, v_scale, gvl );
|
||||
MASK_T scale_mask1 = VMFGT( v1, v_scale, gvl );
|
||||
if( VCPOP( scale_mask0, gvl ) + VCPOP( scale_mask1, gvl ) > 0 ){ // scale change?
|
||||
// find largest element in v0 and v1
|
||||
v_res = VFREDMAX( v_res, v0, v_z0, gvl );
|
||||
v_res = VFREDMAX( v_res, v1, v_res, gvl );
|
||||
FLOAT const largest_elt = EXTRACT_FLOAT( v_res );
|
||||
|
||||
v_scale = VFDIV( v_scale, largest_elt, gvl ); // scale/largest_elt
|
||||
v_scale = VFMUL( v_scale, v_scale, gvl ); // (scale/largest_elt)*(scale/largest_elt)
|
||||
v_ssq = VFMUL( v_scale, v_ssq, gvl ); // ssq*(scale/largest_elt)*(scale/largest_elt)
|
||||
|
||||
v_scale = VFMVVF_FLOAT( largest_elt, gvl ); // splated largest_elt becomes new scale
|
||||
}
|
||||
//ssq in vector vr: vr[0]
|
||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
|
||||
//total ssq now
|
||||
ssq += VFMVFS_FLOAT(v_res);
|
||||
|
||||
//tail
|
||||
if(j < n2){
|
||||
gvl = VSETVL(n2-j);
|
||||
v0 = VLEV_FLOAT(&x[j], gvl);
|
||||
//fabs(vector)
|
||||
mask = VMFLTVF_FLOAT(v0, 0, gvl);
|
||||
v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
|
||||
//if scale change
|
||||
mask = VMFGTVF_FLOAT(v0, scale, gvl);
|
||||
index = VMFIRSTM(mask, gvl);
|
||||
if(index == -1){//no elements greater than scale
|
||||
if(scale != 0.0)
|
||||
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
|
||||
}else{//found greater element
|
||||
//find max
|
||||
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
|
||||
//update ssq before max_index
|
||||
ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
|
||||
//update scale
|
||||
scale = VFMVFS_FLOAT(v_res);
|
||||
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
|
||||
}
|
||||
vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
|
||||
//ssq in vector vr: vr[0]
|
||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
|
||||
//total ssq now
|
||||
ssq += VFMVFS_FLOAT(v_res);
|
||||
}
|
||||
}else{
|
||||
gvl = VSETVL(n);
|
||||
vr = VFMVVF_FLOAT(0, gvl);
|
||||
v_zero = VFMVVF_FLOAT(0, gvl);
|
||||
unsigned int stride_x = inc_x * sizeof(FLOAT) * 2;
|
||||
int idx = 0, inc_v = inc_x * gvl * 2;
|
||||
for(i=0,j=0; i<n/gvl; i++){
|
||||
v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl);
|
||||
//fabs(vector)
|
||||
mask = VMFLTVF_FLOAT(v0, 0, gvl);
|
||||
v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
|
||||
//if scale change
|
||||
mask = VMFGTVF_FLOAT(v0, scale, gvl);
|
||||
index = VMFIRSTM(mask, gvl);
|
||||
if(index == -1){//no elements greater than scale
|
||||
if(scale != 0.0){
|
||||
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
|
||||
vr = VFMACCVV_FLOAT(vr, v0, v0, gvl);
|
||||
}
|
||||
}else{//found greater element
|
||||
//ssq in vector vr: vr[0]
|
||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
|
||||
//total ssq before current vector
|
||||
ssq += VFMVFS_FLOAT(v_res);
|
||||
//find max
|
||||
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
|
||||
//update ssq before max_index
|
||||
ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
|
||||
//update scale
|
||||
scale = VFMVFS_FLOAT(v_res);
|
||||
//ssq in vector vr
|
||||
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
|
||||
vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
|
||||
}
|
||||
MASK_T nonzero_mask0 = VMFNE( v0, 0, gvl );
|
||||
            MASK_T nonzero_mask1 = VMFNE( v1, 0, gvl );
            v0 = VFDIV_M( nonzero_mask0, v_zero, v0, v_scale, gvl );
            v1 = VFDIV_M( nonzero_mask1, v_zero, v1, v_scale, gvl );
            v_ssq = VFMACC_M( nonzero_mask0, v_ssq, v0, v0, gvl );
            v_ssq = VFMACC_M( nonzero_mask1, v_ssq, v1, v1, gvl );

            v0 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl);
            //fabs(vector)
            mask = VMFLTVF_FLOAT(v0, 0, gvl);
            v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
            //if scale change
            mask = VMFGTVF_FLOAT(v0, scale, gvl);
            index = VMFIRSTM(mask, gvl);
            if(index == -1){ //no elements greater than scale
                if(scale != 0.0){
                    v0 = VFDIVVF_FLOAT(v0, scale, gvl);
                    vr = VFMACCVV_FLOAT(vr, v0, v0, gvl);
                }
            }else{ //found greater element
                //ssq in vector vr: vr[0]
                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
                //total ssq before current vector
                ssq += VFMVFS_FLOAT(v_res);
                //find max
                v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
                //update ssq before max_index
                ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
                //update scale
                scale = VFMVFS_FLOAT(v_res);
                //ssq in vector vr
                v0 = VFDIVVF_FLOAT(v0, scale, gvl);
                vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
            }
            j += gvl;
            idx += inc_v;
        }
        //ssq in vector vr: vr[0]
        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
        //total ssq now
        ssq += VFMVFS_FLOAT(v_res);

        //tail
        if(j < n){
            gvl = VSETVL(n-j);
            v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl);
            //fabs(vector)
            mask = VMFLTVF_FLOAT(v0, 0, gvl);
            v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
            //if scale change
            mask = VMFGTVF_FLOAT(v0, scale, gvl);
            index = VMFIRSTM(mask, gvl);
            if(index == -1){ //no elements greater than scale
                if(scale != 0.0){
                    v0 = VFDIVVF_FLOAT(v0, scale, gvl);
                    vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
                }
            }else{ //found greater element
                //find max
                v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
                //update ssq before max_index
                ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
                //update scale
                scale = VFMVFS_FLOAT(v_res);
                v0 = VFDIVVF_FLOAT(v0, scale, gvl);
                vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
            }

            v0 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl);
            //fabs(vector)
            mask = VMFLTVF_FLOAT(v0, 0, gvl);
            v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
            //if scale change
            mask = VMFGTVF_FLOAT(v0, scale, gvl);
            index = VMFIRSTM(mask, gvl);
            if(index == -1){ //no elements greater than scale
                if(scale != 0.0){
                    v0 = VFDIVVF_FLOAT(v0, scale, gvl);
                    vr = VFMACCVV_FLOAT(vr, v0, v0, gvl);
                }
            }else{ //found greater element
                //ssq in vector vr: vr[0]
                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
                //total ssq before current vector
                ssq += VFMVFS_FLOAT(v_res);
                //find max
                v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
                //update ssq before max_index
                ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
                //update scale
                scale = VFMVFS_FLOAT(v_res);
                v0 = VFDIVVF_FLOAT(v0, scale, gvl);
                vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
            }
            //ssq in vector vr: vr[0]
            v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
            //total ssq now
            ssq += VFMVFS_FLOAT(v_res);
        }
        idx += inc_x * gvl * 2;
    }
    return(scale * sqrt(ssq));

    v_res = VFREDUSUM(v_res, v_ssq, v_z0, gvl);
    FLOAT ssq = EXTRACT_FLOAT(v_res);
    FLOAT scale = EXTRACT_FLOAT0_V(v_scale);

    //finish any tail using scalar ops
    i *= gvl;
    if(i < n){
        i *= inc_x*2;
        n *= inc_x*2;
        FLOAT temp;
        do{
            if ( x[i] != 0.0 ){
                temp = ABS( x[i] );
                if ( scale < temp ){
                    ssq = 1 + ssq * ( scale / temp ) * ( scale / temp );
                    scale = temp ;
                }else{
                    ssq += ( temp / scale ) * ( temp / scale );
                }
            }

            if ( x[i+1] != 0.0 ){
                temp = ABS( x[i+1] );
                if ( scale < temp ){
                    ssq = 1 + ssq * ( scale / temp ) * ( scale / temp );
                    scale = temp ;
                }else{
                    ssq += ( temp / scale ) * ( temp / scale );
                }
            }

            i += inc_x*2;
        }while(i<n);
    }

    return(scale * sqrt(ssq));
}
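The scalar do/while tail above is the classic one-pass rescaled sum-of-squares recurrence from the reference ?NRM2 routines: the result is carried as scale * sqrt(ssq), and ssq is rescaled whenever a new running maximum appears, which is what keeps the kernel stable for very large or very small inputs (the "nrm2 numeric stability" item in the commit message). A minimal standalone version of the same recurrence for a real vector, with a hypothetical helper name not taken from this commit:

#include <math.h>

/* One-pass rescaled sum of squares: invariant is
   sum of x[k]^2 seen so far == scale*scale*ssq, so neither
   ssq nor any intermediate square can overflow or underflow. */
static double nrm2_ref(long n, const double *x, long inc_x)
{
    double scale = 0.0, ssq = 1.0, temp;
    for (long i = 0; i < n * inc_x; i += inc_x) {
        if (x[i] != 0.0) {
            temp = fabs(x[i]);
            if (scale < temp) {
                /* new running maximum: fold old ssq under the new scale */
                ssq = 1.0 + ssq * (scale / temp) * (scale / temp);
                scale = temp;
            } else {
                ssq += (temp / scale) * (temp / scale);
            }
        }
    }
    return scale * sqrt(ssq);
}

The vectorised path does the same work in bulk: elements are divided by the current scale under a nonzero mask (VFDIV_M) and accumulated into v_ssq (VFMACC_M), with any strip-mining remainder falling back to this scalar recurrence.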
@@ -30,10 +30,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL(n) vsetvl_e32m4(n)
 #define VSETVL_MAX vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m4_t
-#define VLEV_FLOAT vle_v_f32m4
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSEV_FLOAT vse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
+#define VLEV_FLOAT vle32_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
+#define VSEV_FLOAT vse32_v_f32m4
+#define VSSEV_FLOAT vsse32_v_f32m4
 #define VFMACCVF_FLOAT vfmacc_vf_f32m4
 #define VFMULVF_FLOAT vfmul_vf_f32m4
 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
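These four renames show the whole pattern of the 0.7.1-to-1.0 intrinsic update: loads and stores gain an explicit element width (vle_v_f32m4 becomes vle32_v_f32m4, and so on), while the arithmetic intrinsics keep their names. A minimal sketch using the new spellings, assuming <riscv_vector.h> from a toolchain contemporary with this commit (later toolchains additionally prefix these names with __riscv_):

#include <riscv_vector.h>

/* Scale a float array in place with the v1.0-era intrinsic names
   used throughout this commit. */
void sscal_sketch(size_t n, float alpha, float *x)
{
    for (size_t i = 0; i < n; ) {
        size_t vl = vsetvl_e32m4(n - i);            /* strip-mining */
        vfloat32m4_t v = vle32_v_f32m4(&x[i], vl);  /* was vle_v_f32m4 */
        v = vfmul_vf_f32m4(v, alpha, vl);
        vse32_v_f32m4(&x[i], v, vl);                /* was vse_v_f32m4 */
        i += vl;
    }
}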
@@ -41,10 +41,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL(n) vsetvl_e64m4(n)
 #define VSETVL_MAX vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m4_t
-#define VLEV_FLOAT vle_v_f64m4
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSEV_FLOAT vse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
+#define VLEV_FLOAT vle64_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
+#define VSEV_FLOAT vse64_v_f64m4
+#define VSSEV_FLOAT vsse64_v_f64m4
 #define VFMACCVF_FLOAT vfmacc_vf_f64m4
 #define VFMULVF_FLOAT vfmul_vf_f64m4
 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
@@ -59,7 +59,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
 	unsigned int gvl = 0;

 	FLOAT_V_T vt0, vt1, vx0, vx1, vy0, vy1;
-	gvl = VSETVL(n);
+	gvl = VSETVL((inc_x != 0 && inc_y != 0) ? n : 1);
 	BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
 	BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);
 	BLASLONG inc_xv = inc_x * 2 * gvl;
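The one-line change above is the degenerate-stride hardening from the commit's corner-case test fixes: when either increment is zero, vsetvl is asked for a vector length of 1, so the strided loop processes one element per iteration and the sequential BLAS semantics are preserved (with inc == 0 every iteration addresses the same element, which a lane-parallel stride-0 access would not reproduce). The swap kernel further down pairs the same guard with an n = n & 1 shortcut; a scalar model of why that shortcut is sound, written for illustration only:

/* With inc_x == inc_y == 0, swap exchanges the same two scalars n times,
   so the net effect depends only on the parity of n. */
void swap_inc0_model(long n, double *x, double *y)
{
    for (long k = 0; k < n; k++) {  /* n repeated swaps of one pair */
        double t = *x;
        *x = *y;
        *y = t;
    }
    /* identity for even n, a single swap for odd n: hence n = n & 1; */
}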
@@ -30,8 +30,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL(n) vsetvl_e32m4(n)
 #define VSETVL_MAX vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m4_t
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
+#define VSSEV_FLOAT vsse32_v_f32m4
 #define VFMACCVF_FLOAT vfmacc_vf_f32m4
 #define VFMULVF_FLOAT vfmul_vf_f32m4
 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
@@ -40,8 +40,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL(n) vsetvl_e64m4(n)
 #define VSETVL_MAX vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m4_t
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
+#define VSSEV_FLOAT vsse64_v_f64m4
 #define VFMACCVF_FLOAT vfmacc_vf_f64m4
 #define VFMULVF_FLOAT vfmul_vf_f64m4
 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
@@ -0,0 +1,120 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"
#include <math.h>

#define LMUL m4
#if defined(DOUBLE)
# define ELEN 64
# define MLEN _b8
#else
# define ELEN 32
# define MLEN _b4
#endif

#define _
#define JOIN2_X(x, y) x ## y
#define JOIN2(x, y) JOIN2_X(x, y)
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)

#define VSETVL JOIN(vsetvl, _e, ELEN, LMUL, _)
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _)
#define VLEV_FLOAT JOIN(vle, ELEN, _v_f, ELEN, LMUL)
#define VLSEV_FLOAT JOIN(vlse, ELEN, _v_f, ELEN, LMUL)
#define VFREDSUMVS_FLOAT JOIN(vfredusum_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1))
#define VFMVVF_FLOAT JOIN(vfmv, _v_f_f, ELEN, LMUL, _)
#define VFMVVF_FLOAT_M1 JOIN(vfmv, _v_f_f, ELEN, m1, _)
#define VFADDVV_FLOAT JOIN(vfadd, _vv_f, ELEN, LMUL, _)
#define VMFLTVF_FLOAT JOIN(vmflt, _vf_f, ELEN, LMUL, MLEN)

FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG i=0, j=0;
    BLASLONG ix=0;
    FLOAT asumf=0.0;
    if (n <= 0 || inc_x <= 0) return(asumf);
    unsigned int gvl = 0;
    FLOAT_V_T v0, v1, v_zero, v_sum;
    FLOAT_V_T_M1 v_res;
    v_res = VFMVVF_FLOAT_M1(0, 1);

    if(inc_x == 1){
        BLASLONG n2 = n * 2;
        gvl = VSETVL(n2);
        v_zero = VFMVVF_FLOAT(0, gvl);
        if(gvl <= n2/2){
            v_sum = VFMVVF_FLOAT(0, gvl);
            for(i=0,j=0; i<n2/(gvl*2); i++){
                v0 = VLEV_FLOAT(&x[j], gvl);
                v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);

                v1 = VLEV_FLOAT(&x[j+gvl], gvl);
                v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);
                j += gvl * 2;
            }
            v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_res, gvl);
        }
        for(;j<n2;){
            gvl = VSETVL(n2-j);
            v0 = VLEV_FLOAT(&x[j], gvl);
            v_res = VFREDSUMVS_FLOAT(v_res, v0, v_res, gvl);
            j += gvl;
        }
    }else{
        gvl = VSETVL(n);
        unsigned int stride_x = inc_x * sizeof(FLOAT) * 2;
        v_zero = VFMVVF_FLOAT(0, gvl);

        BLASLONG inc_xv = inc_x * 2 * gvl;
        v_sum = VFMVVF_FLOAT(0, gvl);
        for(i=0,j=0; i<n/gvl; i++){
            v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
            v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);

            v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
            v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);

            j += gvl;
            ix += inc_xv;
        }
        v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_res, gvl);
        if(j<n){
            gvl = VSETVL(n-j);
            v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);

            v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
            v_sum = VFADDVV_FLOAT(v0, v1, gvl);
            v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_res, gvl);
        }
    }
    asumf = EXTRACT_FLOAT(v_res);
    return(asumf);
}
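For reference, this new file is one of the vectorised sum/zsum kernels from the commit message: it computes the plain (unsigned) sum of the real and imaginary parts of a complex vector, i.e. asum without the absolute value, accumulating partial sums in v_sum and folding each strip-mined chunk into the scalar held in v_res[0]. A scalar model of the same computation, with a hypothetical helper name not taken from this commit:

/* Sum of the real and imaginary parts of n complex elements,
   stride inc_x measured in complex units (matching the kernel's
   stride of inc_x * 2 * sizeof(FLOAT) bytes). */
static double zsum_model(long n, const double *x, long inc_x)
{
    double s = 0.0;
    for (long k = 0; k < n; k++)
        s += x[2 * k * inc_x] + x[2 * k * inc_x + 1];
    return s;
}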
@@ -27,35 +27,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #include "common.h"
 #include <stdio.h>
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m8_t
-#define VLEV_FLOAT vle_v_f32m8
-#define VLSEV_FLOAT vlse_v_f32m8
-#define VSEV_FLOAT vse_v_f32m8
-#define VSSEV_FLOAT vsse_v_f32m8
+
+#define LMUL m8
+#if defined(DOUBLE)
+# define ELEN 64
+# define MLEN 16
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m8_t
-#define VLEV_FLOAT vle_v_f64m8
-#define VLSEV_FLOAT vlse_v_f64m8
-#define VSEV_FLOAT vse_v_f64m8
-#define VSSEV_FLOAT vsse_v_f64m8
+# define ELEN 32
+# define MLEN 8
 #endif
+
+#define _
+#define JOIN2_X(x, y) x ## y
+#define JOIN2(x, y) JOIN2_X(x, y)
+#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
+
+#define VSETVL JOIN(vsetvl, _e, ELEN, LMUL, _)
+#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
+#define VLEV_FLOAT JOIN(vle, ELEN, _v_f, ELEN, LMUL)
+#define VLSEV_FLOAT JOIN(vlse, ELEN, _v_f, ELEN, LMUL)
+#define VSEV_FLOAT JOIN(vse, ELEN, _v_f, ELEN, LMUL)
+#define VSSEV_FLOAT JOIN(vsse, ELEN, _v_f, ELEN, LMUL)

 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
 {
 	BLASLONG i = 0, j = 0;
 	BLASLONG ix = 0,iy = 0;
 	BLASLONG stride_x, stride_y;
 	FLOAT_V_T vx0, vx1, vy0, vy1;
-	unsigned int gvl = 0;
+	unsigned int gvl = VSETVL((inc_x != 0 && inc_y != 0) ? n : 1);
+	if( inc_x == 0 && inc_y == 0 ) { n = n & 1; }

 	if (n < 0) return(0);
 	if(inc_x == 1 && inc_y == 1){
-		gvl = VSETVL(n);
 		BLASLONG n2 = n * 2;
 		if(gvl <= n2/2){
 			for(i=0,j=0; i<n2/(2*gvl); i++){
@@ -80,7 +84,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm
 			j += gvl;
 		}
 	}else{
-		gvl = VSETVL(n);
 		stride_x = inc_x * 2 * sizeof(FLOAT);
 		stride_y = inc_y * 2 * sizeof(FLOAT);
 		BLASLONG inc_xv = inc_x * gvl * 2;
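The JOIN block in the hunk above is what makes these kernels easy to retarget: ELEN, LMUL and MLEN are the only knobs, and every intrinsic name is re-derived from them by token pasting, which is also how the commit's "reduce LMUL to avoid register spills" tuning is done in one line. A standalone sketch of the mechanism, reusing the same macros plus a stringizer to print the generated name (demo only, not part of the commit):

#include <stdio.h>

#define _                     /* empty token, so a trailing `_` argument
                                 expands to nothing before pasting */
#define JOIN2_X(x, y) x ## y
#define JOIN2(x, y)   JOIN2_X(x, y)
#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
#define STR_X(s) #s
#define STR(s)   STR_X(s)

int main(void)
{
    /* single-precision, LMUL=8 branch from the hunk above */
    puts(STR(JOIN(vlse,   32, _v_f, 32, m8)));  /* prints vlse32_v_f32m8 */
    puts(STR(JOIN(vfloat, 32, m8,   _t, _)));   /* prints vfloat32m8_t   */
    return 0;
}

The two-level JOIN2_X/JOIN2 indirection is what forces the arguments (ELEN, LMUL, the empty `_`) to be macro-expanded before ## concatenates them.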
File diff suppressed because it is too large