diff --git a/kernel/riscv64/axpby_rvv.c b/kernel/riscv64/axpby_rvv.c index a1dbdb0e4..d7fb86eab 100644 --- a/kernel/riscv64/axpby_rvv.c +++ b/kernel/riscv64/axpby_rvv.c @@ -53,7 +53,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * { FLOAT_V_T vx, vy; - if ( n < 0 ) return(0); + if ( n <= 0 ) return(0); if ( beta == 0.0 ) { if ( alpha == 0.0 ) { @@ -63,7 +63,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * BLASLONG stride_y = inc_y * sizeof(FLOAT); size_t vl = VSETVL(n); vy = VFMVVF_FLOAT(0.0, vl); - for ( ; n > 0; n -= vl, y += vl*stride_y) { + for ( ; n > 0; n -= vl, y += vl*inc_y) { vl = VSETVL(n); VSSEV_FLOAT(y, stride_y, vy, vl); } @@ -126,10 +126,12 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * } else { if ((1 == inc_x) && (1 == inc_y)) { - for (size_t vl; n > 0; n -= vl, y += vl) { + for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { vl = VSETVL(n); + vx = VLEV_FLOAT(x, vl); vy = VLEV_FLOAT(y, vl); vy = VFMULVF_FLOAT(vy, beta, vl); + vy = VFMACCVF_FLOAT(vy, alpha, vx, vl); VSEV_FLOAT (y, vy, vl); } } else if (1 == inc_x) { diff --git a/kernel/riscv64/copy_rvv.c b/kernel/riscv64/copy_rvv.c index 041fd2dae..9d4b84095 100644 --- a/kernel/riscv64/copy_rvv.c +++ b/kernel/riscv64/copy_rvv.c @@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { - if(n < 0) return(0); + if(n <= 0) return(0); FLOAT_V_T v0; diff --git a/kernel/riscv64/gemm_ncopy_8_rvv.c b/kernel/riscv64/gemm_ncopy_8_rvv.c index 3030d67fb..c652ab0c0 100644 --- a/kernel/riscv64/gemm_ncopy_8_rvv.c +++ b/kernel/riscv64/gemm_ncopy_8_rvv.c @@ -30,19 +30,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
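The axpby hunks above change three things: the early return now also covers n == 0, the zero-fill loop advances y by vl*inc_y elements instead of vl*stride_y bytes, and the unit-stride beta path now loads x and accumulates the alpha*x term it previously dropped. A scalar reference of the intended semantics for the unit-stride case (axpby_ref is an illustrative helper, not part of the patch):

#include <stddef.h>

/* y[i] = alpha*x[i] + beta*y[i] -- what the fixed unit-stride loop computes.
 * The previous vector loop in this branch only scaled y by beta. */
static void axpby_ref(size_t n, float alpha, const float *x, float beta, float *y)
{
    for (size_t i = 0; i < n; i++)
        y[i] = alpha * x[i] + beta * y[i];
}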
#if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m1(n) #define FLOAT_V_T vfloat32m1_t +#define FLOAT_VX2_T vfloat32m1x2_t +#define FLOAT_VX4_T vfloat32m1x4_t +#define FLOAT_VX8_T vfloat32m1x8_t +#define VSET_VX2 __riscv_vset_v_f32m1_f32m1x2 +#define VSET_VX4 __riscv_vset_v_f32m1_f32m1x4 +#define VSET_VX8 __riscv_vset_v_f32m1_f32m1x8 #define VLEV_FLOAT __riscv_vle32_v_f32m1 #define VSEV_FLOAT __riscv_vse32_v_f32m1 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1 -#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1 -#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2 +#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4 +#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1x8 #else #define VSETVL(n) __riscv_vsetvl_e64m1(n) #define FLOAT_V_T vfloat64m1_t +#define FLOAT_VX2_T vfloat64m1x2_t +#define FLOAT_VX4_T vfloat64m1x4_t +#define FLOAT_VX8_T vfloat64m1x8_t +#define VSET_VX2 __riscv_vset_v_f64m1_f64m1x2 +#define VSET_VX4 __riscv_vset_v_f64m1_f64m1x4 +#define VSET_VX8 __riscv_vset_v_f64m1_f64m1x8 #define VLEV_FLOAT __riscv_vle64_v_f64m1 #define VSEV_FLOAT __riscv_vse64_v_f64m1 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1 -#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1 -#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2 +#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4 +#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1x8 #endif // Optimizes the implementation in ../generic/gemm_ncopy_8.c @@ -57,6 +69,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) FLOAT *b_offset; FLOAT_V_T v1, v2, v3, v4, v5, v6, v7, v8; + FLOAT_VX2_T vx2; + FLOAT_VX4_T vx4; + FLOAT_VX8_T vx8; + size_t vl; //fprintf(stderr, "gemm_ncopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda); @@ -87,7 +103,16 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) v7 = VLEV_FLOAT(a_offset7, vl); v8 = VLEV_FLOAT(a_offset8, vl); - VSSEG8_FLOAT(b_offset, v1, v2, v3, v4, v5, v6, v7, v8, vl); + vx8 = VSET_VX8(vx8, 0, v1); + vx8 = VSET_VX8(vx8, 1, v2); + vx8 = VSET_VX8(vx8, 2, v3); + vx8 = VSET_VX8(vx8, 3, v4); + vx8 = VSET_VX8(vx8, 4, v5); + vx8 = VSET_VX8(vx8, 5, v6); + vx8 = VSET_VX8(vx8, 6, v7); + vx8 = VSET_VX8(vx8, 7, v8); + + VSSEG8_FLOAT(b_offset, vx8, vl); a_offset1 += vl; a_offset2 += vl; @@ -116,7 +141,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) v3 = VLEV_FLOAT(a_offset3, vl); v4 = VLEV_FLOAT(a_offset4, vl); - VSSEG4_FLOAT(b_offset, v1, v2, v3, v4, vl); + vx4 = VSET_VX4(vx4, 0, v1); + vx4 = VSET_VX4(vx4, 1, v2); + vx4 = VSET_VX4(vx4, 2, v3); + vx4 = VSET_VX4(vx4, 3, v4); + + VSSEG4_FLOAT(b_offset, vx4, vl); a_offset1 += vl; a_offset2 += vl; @@ -137,7 +167,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) v1 = VLEV_FLOAT(a_offset1, vl); v2 = VLEV_FLOAT(a_offset2, vl); - VSSEG2_FLOAT(b_offset, v1, v2, vl); + vx2 = VSET_VX2(vx2, 0, v1); + vx2 = VSET_VX2(vx2, 1, v2); + + VSSEG2_FLOAT(b_offset, vx2, vl); a_offset1 += vl; a_offset2 += vl; diff --git a/kernel/riscv64/gemm_tcopy_8_rvv.c b/kernel/riscv64/gemm_tcopy_8_rvv.c index 080a87312..4742ae6a7 100644 --- a/kernel/riscv64/gemm_tcopy_8_rvv.c +++ b/kernel/riscv64/gemm_tcopy_8_rvv.c @@ -30,27 +30,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
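These macro changes move gemm_ncopy_8 from the pre-tuple segment-store intrinsics, which took every register as a separate argument, to the tuple-type API: the rows are packed with __riscv_vset_* and written with a single __riscv_vssegNe*_v_*xN call. A minimal sketch of that pattern for two rows, assuming a toolchain that ships the v0.12 tuple-type RVV intrinsics; interleave2_f32 is an illustrative helper, not code from the patch:

#include <riscv_vector.h>
#include <stddef.h>

/* Pack one vector from row a and one from row b into a 2-field tuple and
 * store them interleaved: dst = a[0], b[0], a[1], b[1], ...
 * This mirrors the VSET_VX2 / VSSEG2_FLOAT pattern used by the copy kernel,
 * including its insert-into-uninitialized-tuple idiom. */
static void interleave2_f32(const float *a, const float *b, float *dst, size_t n)
{
    for (size_t vl; n > 0; n -= vl, a += vl, b += vl, dst += vl * 2) {
        vl = __riscv_vsetvl_e32m1(n);
        vfloat32m1_t va = __riscv_vle32_v_f32m1(a, vl);
        vfloat32m1_t vb = __riscv_vle32_v_f32m1(b, vl);
        vfloat32m1x2_t vx2;                                   /* tuple {field0, field1} */
        vx2 = __riscv_vset_v_f32m1_f32m1x2(vx2, 0, va);
        vx2 = __riscv_vset_v_f32m1_f32m1x2(vx2, 1, vb);
        __riscv_vsseg2e32_v_f32m1x2(dst, vx2, vl);            /* segment store of pairs */
    }
}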
#if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m1(n) #define FLOAT_V_T vfloat32m1_t +#define FLOAT_VX2_T vfloat32m1x2_t +#define FLOAT_VX4_T vfloat32m1x4_t +#define FLOAT_VX8_T vfloat32m1x8_t #define VLEV_FLOAT __riscv_vle32_v_f32m1 #define VLSEV_FLOAT __riscv_vlse32_v_f32m1 #define VSEV_FLOAT __riscv_vse32_v_f32m1 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m1 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1 -#define VLSSEG4_FLOAT __riscv_vlsseg4e32_v_f32m1 -#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1 -#define VLSSEG8_FLOAT __riscv_vlsseg8e32_v_f32m1 -#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m1x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2 +#define VLSSEG4_FLOAT __riscv_vlsseg4e32_v_f32m1x4 +#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4 +#define VLSSEG8_FLOAT __riscv_vlsseg8e32_v_f32m1x8 +#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1x8 #else #define VSETVL(n) __riscv_vsetvl_e64m1(n) #define FLOAT_V_T vfloat64m1_t +#define FLOAT_VX2_T vfloat64m1x2_t +#define FLOAT_VX4_T vfloat64m1x4_t +#define FLOAT_VX8_T vfloat64m1x8_t #define VLEV_FLOAT __riscv_vle64_v_f64m1 #define VLSEV_FLOAT __riscv_vlse64_v_f64m1 #define VSEV_FLOAT __riscv_vse64_v_f64m1 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m1 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1 -#define VLSSEG4_FLOAT __riscv_vlsseg4e64_v_f64m1 -#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1 -#define VLSSEG8_FLOAT __riscv_vlsseg8e64_v_f64m1 -#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m1x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2 +#define VLSSEG4_FLOAT __riscv_vlsseg4e64_v_f64m1x4 +#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4 +#define VLSSEG8_FLOAT __riscv_vlsseg8e64_v_f64m1x8 +#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1x8 #endif int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) @@ -62,7 +68,10 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4; - FLOAT_V_T v0, v1, v2, v3, v4, v5, v6, v7; + FLOAT_V_T v0; + FLOAT_VX2_T vx2; + FLOAT_VX4_T vx4; + FLOAT_VX8_T vx8; // fprintf(stderr, "gemm_tcopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda); @@ -83,8 +92,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) for(i = (n >> 3); i > 0; i--) { size_t vl = 8; - VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT), vl); - VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl); + vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG8_FLOAT(boffset1, vx8, vl); aoffset1 += 8; boffset1 += m * 8; @@ -93,8 +102,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) if (n & 4) { size_t vl = 8; - VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT), vl); - VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl); + vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG4_FLOAT(boffset2, vx4, vl); aoffset1 += 4; boffset2 += 32; @@ -103,8 +112,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) if (n & 2) { size_t vl = 8; - VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT), vl); - VSSEG2_FLOAT(boffset3, v0, v1, vl); + vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG2_FLOAT(boffset3, vx2, vl); aoffset1 += 2; boffset3 += 16; @@ -133,8 +142,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) for(i = (n >> 3); i > 0; i--) { size_t vl = 4; - VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, 
&v6, &v7, aoffset1, lda * sizeof(FLOAT), vl); - VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl); + vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG8_FLOAT(boffset1, vx8, vl); aoffset1 += 8; boffset1 += m * 8; @@ -143,8 +152,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) if (n & 4) { size_t vl = 4; - VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT), vl); - VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl); + vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG4_FLOAT(boffset2, vx4, vl); aoffset1 += 4; boffset2 += 16; @@ -153,8 +162,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) if (n & 2) { size_t vl = 4; - VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT), vl); - VSSEG2_FLOAT(boffset3, v0, v1, vl); + vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG2_FLOAT(boffset3, vx2, vl); aoffset1 += 2; boffset3 += 8; @@ -181,8 +190,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) for(i = (n >> 3); i > 0; i--) { size_t vl = 2; - VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT), vl); - VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl); + vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG8_FLOAT(boffset1, vx8, vl); aoffset1 += 8; boffset1 += m * 8; @@ -191,8 +200,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) if (n & 4) { size_t vl = 2; - VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT), vl); - VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl); + vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG4_FLOAT(boffset2, vx4, vl); aoffset1 += 4; boffset2 += 8; @@ -201,8 +210,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) if (n & 2) { size_t vl = 2; - VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT), vl); - VSSEG2_FLOAT(boffset3, v0, v1, vl); + vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG2_FLOAT(boffset3, vx2, vl); aoffset1 += 2; boffset3 += 4; diff --git a/kernel/riscv64/izamax_rvv.c b/kernel/riscv64/izamax_rvv.c index e93f0056c..32f66a7a7 100644 --- a/kernel/riscv64/izamax_rvv.c +++ b/kernel/riscv64/izamax_rvv.c @@ -32,10 +32,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX __riscv_vsetvlmax_e64m4() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 #define VLEV_FLOAT __riscv_vle64_v_f64m4 #define VLSEV_FLOAT __riscv_vlse64_v_f64m4 -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 #define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m4_f64m1 #define MASK_T vbool16_t #define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m4_b16 @@ -61,10 +63,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
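gemm_tcopy_8 is the mirror image: a strided segment load now returns a tuple directly, and that tuple is handed unchanged to the contiguous segment store, so the v0..v7 temporaries disappear. A sketch of the round trip, with the stride in bytes as the kernel uses it (transpose2_f32 is an illustrative name, not from the patch):

#include <riscv_vector.h>
#include <stddef.h>

/* Gather vl pairs a[j*lda], a[j*lda + 1] from consecutive rows and write
 * them back contiguously -- the VLSSEG2 / VSSEG2 round trip of the copy. */
static void transpose2_f32(const float *a, size_t lda, float *b, size_t vl)
{
    ptrdiff_t stride = (ptrdiff_t)(lda * sizeof(float));   /* byte stride between rows */
    vfloat32m1x2_t vx2 = __riscv_vlsseg2e32_v_f32m1x2(a, stride, vl);
    __riscv_vsseg2e32_v_f32m1x2(b, vx2, vl);                /* no vget/vset needed */
}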
#define VSETVL_MAX __riscv_vsetvlmax_e32m4() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 #define VLEV_FLOAT __riscv_vle32_v_f32m4 #define VLSEV_FLOAT __riscv_vlse32_v_f32m4 -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 #define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m4_f32m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m4_b8 @@ -93,6 +97,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return(max_index); FLOAT_V_T vx0, vx1, v_max; + FLOAT_VX2_T vxx2; UINT_V_T v_max_index; MASK_T mask; @@ -107,7 +112,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for (size_t vl; n > 0; n -= vl, x += vl*2, j += vl) { vl = VSETVL(n); - VLSEG_FLOAT(&vx0, &vx1, x, vl); + vxx2 = VLSEG_FLOAT(x, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); vx0 = VFABSV_FLOAT(vx0, vl); vx1 = VFABSV_FLOAT(vx1, vl); @@ -129,7 +137,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, j += vl) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); vx0 = VFABSV_FLOAT(vx0, vl); vx1 = VFABSV_FLOAT(vx1, vl); diff --git a/kernel/riscv64/izamin_rvv.c b/kernel/riscv64/izamin_rvv.c index b5bc27404..d34b220fa 100644 --- a/kernel/riscv64/izamin_rvv.c +++ b/kernel/riscv64/izamin_rvv.c @@ -33,8 +33,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX __riscv_vsetvlmax_e64m4() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m4_f64m1 #define MASK_T vbool16_t #define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m4_b16 @@ -60,8 +62,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
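With the tuple API, the |re| + |im| magnitude that izamax ranks is built by splitting the loaded tuple with __riscv_vget_*. A condensed sketch of that step using the same m4 types as the kernel (cabs1_f32m4 is an illustrative helper):

#include <riscv_vector.h>
#include <stddef.h>

/* Load vl complex numbers (interleaved re, im) and return the elementwise
 * |re| + |im| vector that the index search compares against its maximum. */
static vfloat32m4_t cabs1_f32m4(const float *x, size_t vl)
{
    vfloat32m4x2_t vx2 = __riscv_vlseg2e32_v_f32m4x2(x, vl);
    vfloat32m4_t re = __riscv_vfabs_v_f32m4(__riscv_vget_v_f32m4x2_f32m4(vx2, 0), vl);
    vfloat32m4_t im = __riscv_vfabs_v_f32m4(__riscv_vget_v_f32m4x2_f32m4(vx2, 1), vl);
    return __riscv_vfadd_vv_f32m4(re, im, vl);
}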
#define VSETVL_MAX __riscv_vsetvlmax_e32m4() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m4_f32m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m4_b8 @@ -90,6 +94,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return(min_index); FLOAT_V_T vx0, vx1, v_min; + FLOAT_VX2_T vxx2; UINT_V_T v_min_index; MASK_T mask; @@ -104,7 +109,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for (size_t vl; n > 0; n -= vl, x += vl*2, j += vl) { vl = VSETVL(n); - VLSEG_FLOAT(&vx0, &vx1, x, vl); + vxx2 = VLSEG_FLOAT(x, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); vx0 = VFABSV_FLOAT(vx0, vl); vx1 = VFABSV_FLOAT(vx1, vl); @@ -127,7 +135,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, j += vl) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); vx0 = VFABSV_FLOAT(vx0, vl); vx1 = VFABSV_FLOAT(vx1, vl); diff --git a/kernel/riscv64/nrm2_rvv.c b/kernel/riscv64/nrm2_rvv.c index 994fadb70..3eb423849 100644 --- a/kernel/riscv64/nrm2_rvv.c +++ b/kernel/riscv64/nrm2_rvv.c @@ -26,78 +26,187 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" -#include -#if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m8(n) -#define VSETVL_MAX __riscv_vsetvlmax_e32m8() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT __riscv_vle32_v_f32m8 -#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 -#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m8_tu -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 -#define ABS fabsf -#else -#define VSETVL(n) __riscv_vsetvl_e64m8(n) -#define VSETVL_MAX __riscv_vsetvlmax_e64m8() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT __riscv_vle64_v_f64m8 -#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 -#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m8_tu -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#if defined(DOUBLE) +#define VSETVL __riscv_vsetvl_e64m4 +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVSF_FLOAT __riscv_vfmv_s_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define MASK_T vbool16_t +#define VFABS __riscv_vfabs_v_f64m4 +#define VMFNE __riscv_vmfne_vf_f64m4_b16 +#define VMFGT __riscv_vmfgt_vv_f64m4_b16 +#define VMFEQ __riscv_vmfeq_vf_f64m4_b16 +#define VCPOP __riscv_vcpop_m_b16 +#define VFREDMAX __riscv_vfredmax_vs_f64m4_f64m1 +#define VFREDMIN __riscv_vfredmin_vs_f64m4_f64m1 +#define VFIRST __riscv_vfirst_m_b16 +#define VRGATHER __riscv_vrgather_vx_f64m4 
+#define VFDIV __riscv_vfdiv_vv_f64m4 +#define VFDIV_M __riscv_vfdiv_vv_f64m4_mu +#define VFMUL __riscv_vfmul_vv_f64m4 +#define VFMUL_M __riscv_vfmul_vv_f64m4_mu +#define VFMACC __riscv_vfmacc_vv_f64m4 +#define VFMACC_M __riscv_vfmacc_vv_f64m4_mu +#define VMSBF __riscv_vmsbf_m_b16 +#define VMSOF __riscv_vmsof_m_b16 +#define VMAND __riscv_vmand_mm_b16 +#define VMANDN __riscv_vmand_mm_b16 +#define VFREDSUM __riscv_vfredusum_vs_f64m4_f64m1 +#define VMERGE __riscv_vmerge_vvm_f64m4 +#define VSEV_FLOAT __riscv_vse64_v_f64m4 +#define EXTRACT_FLOAT0_V(v) __riscv_vfmv_f_s_f64m4_f64(v) #define ABS fabs +#else +#define VSETVL __riscv_vsetvl_e32m4 +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVSF_FLOAT __riscv_vfmv_s_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define MASK_T vbool8_t +#define VFABS __riscv_vfabs_v_f32m4 +#define VMFNE __riscv_vmfne_vf_f32m4_b8 +#define VMFGT __riscv_vmfgt_vv_f32m4_b8 +#define VMFEQ __riscv_vmfeq_vf_f32m4_b8 +#define VCPOP __riscv_vcpop_m_b8 +#define VFREDMAX __riscv_vfredmax_vs_f32m4_f32m1 +#define VFREDMIN __riscv_vfredmin_vs_f32m4_f32m1 +#define VFIRST __riscv_vfirst_m_b8 +#define VRGATHER __riscv_vrgather_vx_f32m4 +#define VFDIV __riscv_vfdiv_vv_f32m4 +#define VFDIV_M __riscv_vfdiv_vv_f32m4_mu +#define VFMUL __riscv_vfmul_vv_f32m4 +#define VFMUL_M __riscv_vfmul_vv_f32m4_mu +#define VFMACC __riscv_vfmacc_vv_f32m4 +#define VFMACC_M __riscv_vfmacc_vv_f32m4_mu +#define VMSBF __riscv_vmsbf_m_b8 +#define VMSOF __riscv_vmsof_m_b8 +#define VMAND __riscv_vmand_mm_b8 +#define VMANDN __riscv_vmand_mm_b8 +#define VFREDSUM __riscv_vfredusum_vs_f32m4_f32m1 +#define VMERGE __riscv_vmerge_vvm_f32m4 +#define VSEV_FLOAT __riscv_vse32_v_f32m4 +#define EXTRACT_FLOAT0_V(v) __riscv_vfmv_f_s_f32m4_f32(v) +#define ABS fabsf #endif - FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i=0; - if( n <= 0 ) return(0.0); - if(n == 1) return (ABS(x[0])); + if (n <= 0 || inc_x <= 0) return(0.0); + if(n == 1) return (ABS(x[0])); - FLOAT_V_T vr, v0; - FLOAT_V_T_M1 v_res; - FLOAT ssq = 0.0; + unsigned int gvl = 0; - size_t vlmax = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, vlmax); + MASK_T nonzero_mask; + MASK_T scale_mask; - vr = VFMVVF_FLOAT(0, vlmax); - - if(inc_x == 1) { + gvl = VSETVL(n); + FLOAT_V_T v0; + FLOAT_V_T v_ssq = VFMVVF_FLOAT(0, gvl); + FLOAT_V_T v_scale = VFMVVF_FLOAT(0, gvl); - for (size_t vl; n > 0; n -= vl, x += vl) { - vl = VSETVL(n); + FLOAT scale = 0; + FLOAT ssq = 0; + unsigned int stride_x = inc_x * sizeof(FLOAT); + int idx = 0; - v0 = VLEV_FLOAT(x, vl); + if( n >= gvl ) // don't pay overheads if we're not doing useful work + { + for(i=0; i 0; n -= vl, x += vl * inc_x) { - vl = VSETVL(n); + } - v0 = VLSEV_FLOAT(x, stride_x, vl); - - vr = VFMACCVV_FLOAT_TU(vr, v0, v0, vl); + i += inc_x; } - } - v_res = VFREDSUM_FLOAT(vr, v_res, vlmax); - - ssq = VFMVFS_FLOAT_M1(v_res); - - return sqrt(ssq); + return(scale * sqrt(ssq)); } + + diff --git a/kernel/riscv64/scal_rvv.c b/kernel/riscv64/scal_rvv.c index 2e2cfd31e..2c273fb63 100644 --- a/kernel/riscv64/scal_rvv.c +++ b/kernel/riscv64/scal_rvv.c @@ -29,6 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
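The rewritten nrm2 keeps a running (scale, ssq) pair so that very large or very small inputs no longer overflow or underflow a plain sum of squares; the masked divide/multiply/macc macros defined above are the vector form of the classic LAPACK-style update. A scalar reference of that update (nrm2_ref is illustrative, not code from the patch):

#include <math.h>
#include <stddef.h>

/* Overflow/underflow-safe two-norm: maintain scale and ssq so that
 * sum(x[i]^2) == scale^2 * ssq, then return scale * sqrt(ssq). */
static double nrm2_ref(size_t n, const double *x)
{
    double scale = 0.0, ssq = 1.0;
    for (size_t i = 0; i < n; i++) {
        double ax = fabs(x[i]);
        if (ax == 0.0) continue;
        if (scale < ax) {                 /* new largest magnitude: rescale ssq */
            ssq = 1.0 + ssq * (scale / ax) * (scale / ax);
            scale = ax;
        } else {
            ssq += (ax / scale) * (ax / scale);
        }
    }
    return scale * sqrt(ssq);
}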
#if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() #define FLOAT_V_T vfloat32m8_t #define VLEV_FLOAT __riscv_vle32_v_f32m8 #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 @@ -38,6 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 #else #define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() #define FLOAT_V_T vfloat64m8_t #define VLEV_FLOAT __riscv_vle64_v_f64m8 #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 @@ -54,26 +56,41 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS FLOAT_V_T v0; if(inc_x == 1) { - - for (size_t vl; n > 0; n -= vl, x += vl) { - vl = VSETVL(n); - - v0 = VLEV_FLOAT(x, vl); - v0 = VFMULVF_FLOAT(v0, da, vl); - VSEV_FLOAT(x, v0, vl); + if(da == 0.0) { + int gvl = VSETVL_MAX; + v0 = VFMVVF_FLOAT(0.0, gvl); + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + VSEV_FLOAT(x, v0, vl); + } } - - } else { + else { + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + v0 = VLEV_FLOAT(x, vl); + v0 = VFMULVF_FLOAT(v0, da, vl); + VSEV_FLOAT(x, v0, vl); + } + } + } else { BLASLONG stride_x = inc_x * sizeof(FLOAT); - for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { - vl = VSETVL(n); - - v0 = VLSEV_FLOAT(x, stride_x, vl); - v0 = VFMULVF_FLOAT(v0, da, vl); - VSSEV_FLOAT(x, stride_x, v0, vl); + if(da == 0.0) { + int gvl = VSETVL_MAX; + v0 = VFMVVF_FLOAT(0.0, gvl); + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + VSSEV_FLOAT(x, stride_x, v0, vl); + } + } + else { + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + v0 = VLSEV_FLOAT(x, stride_x, vl); + v0 = VFMULVF_FLOAT(v0, da, vl); + VSSEV_FLOAT(x, stride_x, v0, vl); + } } - } return 0; diff --git a/kernel/riscv64/symv_U_rvv.c b/kernel/riscv64/symv_U_rvv.c index 3cfd3ee4c..bcd2f6981 100644 --- a/kernel/riscv64/symv_U_rvv.c +++ b/kernel/riscv64/symv_U_rvv.c @@ -82,7 +82,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA FLOAT_V_T va, vx, vy, vr; BLASLONG stride_x, stride_y, inc_xv, inc_yv; - + BLASLONG m1 = m - offset; if(inc_x == 1 && inc_y == 1) { diff --git a/kernel/riscv64/trsm_kernel_LN_rvv_v1.c b/kernel/riscv64/trsm_kernel_LN_rvv_v1.c index 886af0c3b..869561fb3 100644 --- a/kernel/riscv64/trsm_kernel_LN_rvv_v1.c +++ b/kernel/riscv64/trsm_kernel_LN_rvv_v1.c @@ -31,13 +31,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e32m2() #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLSEV_FLOAT __riscv_vlse32_v_f32m2 #define VSSEV_FLOAT __riscv_vsse32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSSEG2_FLOAT __riscv_vssseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSSEG2_FLOAT __riscv_vssseg2e32_v_f32m2x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 @@ -45,13 +47,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
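The scal change adds a da == 0 fast path: a zero vector is splatted once at VLMAX and the loop only stores, skipping the per-iteration load and multiply of the general path. A sketch of the unit-stride fast path on its own (zero_fill_f32 is an illustrative name):

#include <riscv_vector.h>
#include <stddef.h>

/* da == 0, inc_x == 1: overwrite x with zeros -- no loads, no multiplies. */
static void zero_fill_f32(float *x, size_t n)
{
    size_t vlmax = __riscv_vsetvlmax_e32m8();
    vfloat32m8_t vzero = __riscv_vfmv_v_f_f32m8(0.0f, vlmax);
    for (size_t vl; n > 0; n -= vl, x += vl) {
        vl = __riscv_vsetvl_e32m8(n);
        __riscv_vse32_v_f32m8(x, vzero, vl);
    }
}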
#define VSETVL(n) __riscv_vsetvl_e64m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m2() #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLSEV_FLOAT __riscv_vlse64_v_f64m2 #define VSSEV_FLOAT __riscv_vsse64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSSEG2_FLOAT __riscv_vssseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSSEG2_FLOAT __riscv_vssseg2e64_v_f64m2x2 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 @@ -140,6 +144,7 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B BLASLONG stride_ldc = sizeof(FLOAT) * ldc * 2; + FLOAT_VX2_T vbx2, vsx2, vcx2; FLOAT_V_T vb1, vb2, vc1, vc2, vs1, vs2; size_t vl; a += (m - 1) * m * 2; @@ -153,7 +158,9 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B for (j = n; j > 0; j -= vl) { vl = VSETVL(j); - VLSSEG2_FLOAT(&vb1, &vb2, pc + i * 2, stride_ldc, vl); + vbx2 = VLSSEG2_FLOAT(pc + i * 2, stride_ldc, vl); + vb1 = VGET_VX2(vbx2, 0); + vb2 = VGET_VX2(vbx2, 1); #ifndef CONJ vs1 = VFMULVF_FLOAT(vb1, aa1, vl); vs1 = VFNMSACVF_FLOAT(vs1, aa2, vb2, vl); @@ -165,12 +172,16 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B vs2 = VFMULVF_FLOAT(vb2, aa1, vl); vs2 = VFNMSACVF_FLOAT(vs2, aa2, vb1, vl); #endif - VSSEG2_FLOAT(b, vs1, vs2, vl); - VSSSEG2_FLOAT(pc + i * 2, stride_ldc, vs1, vs2, vl); + vsx2 = VSET_VX2(vsx2, 0, vs1); + vsx2 = VSET_VX2(vsx2, 1, vs2); + VSSEG2_FLOAT(b, vsx2, vl); + VSSSEG2_FLOAT(pc + i * 2, stride_ldc, vsx2, vl); b += vl * 2; for (k = 0; k < i; k ++) { - VLSSEG2_FLOAT(&vc1, &vc2, pc + k * 2, stride_ldc, vl); + vcx2 = VLSSEG2_FLOAT(pc + k * 2, stride_ldc, vl); + vc1 = VGET_VX2(vcx2, 0); + vc2 = VGET_VX2(vcx2, 1); #ifndef CONJ vc1 = VFMACCVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl); vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl); @@ -182,7 +193,9 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B vc2 = VFMACCVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl); vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl); #endif - VSSSEG2_FLOAT(pc + k * 2, stride_ldc, vc1, vc2, vl); + vcx2 = VSET_VX2(vcx2, 0, vc1); + vcx2 = VSET_VX2(vcx2, 1, vc2); + VSSSEG2_FLOAT(pc + k * 2, stride_ldc, vcx2, vl); } pc += vl * ldc * 2; } diff --git a/kernel/riscv64/trsm_kernel_LT_rvv_v1.c b/kernel/riscv64/trsm_kernel_LT_rvv_v1.c index ddeef966c..da443cfba 100644 --- a/kernel/riscv64/trsm_kernel_LT_rvv_v1.c +++ b/kernel/riscv64/trsm_kernel_LT_rvv_v1.c @@ -31,13 +31,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
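In the trsm solve loops, each VLSSEG2 result is unpacked into real and imaginary parts, combined with the scalar factors aa1/aa2 by the usual complex multiply, repacked with VSET_VX2, and stored. For reference, the arithmetic behind the #ifndef CONJ / #else branches is a complex product with an optionally conjugated factor (scalar sketch, not code from the patch):

/* s = b * a (non-conjugated), or s = b * conj(a) when CONJ is defined,
 * applied per element of the vb1/vb2 vectors. */
static void cmul_ref(float br, float bi, float ar, float ai, int conj,
                     float *sr, float *si)
{
    if (!conj) {
        *sr = br * ar - bi * ai;
        *si = bi * ar + br * ai;
    } else {
        *sr = br * ar + bi * ai;
        *si = bi * ar - br * ai;
    }
}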
#define VSETVL(n) __riscv_vsetvl_e32m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e32m2() #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLSEV_FLOAT __riscv_vlse32_v_f32m2 #define VSSEV_FLOAT __riscv_vsse32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSSEG2_FLOAT __riscv_vssseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSSEG2_FLOAT __riscv_vssseg2e32_v_f32m2x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 @@ -45,13 +47,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m2() #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLSEV_FLOAT __riscv_vlse64_v_f64m2 #define VSSEV_FLOAT __riscv_vsse64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSSEG2_FLOAT __riscv_vssseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSSEG2_FLOAT __riscv_vssseg2e64_v_f64m2x2 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 @@ -137,6 +141,7 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B BLASLONG stride_ldc = sizeof(FLOAT) * ldc * 2; + FLOAT_VX2_T vbx2, vsx2, vcx2; FLOAT_V_T vb1, vb2, vc1, vc2, vs1, vs2; size_t vl; @@ -149,7 +154,9 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B for (j = n; j > 0; j -= vl) { vl = VSETVL(j); - VLSSEG2_FLOAT(&vb1, &vb2, pc + i * 2, stride_ldc, vl); + vbx2 = VLSSEG2_FLOAT(pc + i * 2, stride_ldc, vl); + vb1 = VGET_VX2(vbx2, 0); + vb2 = VGET_VX2(vbx2, 1); #ifndef CONJ vs1 = VFMULVF_FLOAT(vb1, aa1, vl); vs1 = VFNMSACVF_FLOAT(vs1, aa2, vb2, vl); @@ -161,12 +168,16 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B vs2 = VFMULVF_FLOAT(vb2, aa1, vl); vs2 = VFNMSACVF_FLOAT(vs2, aa2, vb1, vl); #endif - VSSEG2_FLOAT(b, vs1, vs2, vl); - VSSSEG2_FLOAT(pc + i * 2, stride_ldc, vs1, vs2, vl); + vsx2 = VSET_VX2(vsx2, 0, vs1); + vsx2 = VSET_VX2(vsx2, 1, vs2); + VSSEG2_FLOAT(b, vsx2, vl); + VSSSEG2_FLOAT(pc + i * 2, stride_ldc, vsx2, vl); b += vl * 2; for (k = i + 1; k < m; k++) { - VLSSEG2_FLOAT(&vc1, &vc2, pc + k * 2, stride_ldc, vl); + vcx2 = VLSSEG2_FLOAT(pc + k * 2, stride_ldc, vl); + vc1 = VGET_VX2(vcx2, 0); + vc2 = VGET_VX2(vcx2, 1); #ifndef CONJ vc1 = VFMACCVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl); vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl); @@ -178,7 +189,9 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B vc2 = VFMACCVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl); vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl); #endif - VSSSEG2_FLOAT(pc + k * 2, stride_ldc, vc1, vc2, vl); + vcx2 = VSET_VX2(vcx2, 0, vc1); + vcx2 = VSET_VX2(vcx2, 1, vc2); + 
VSSSEG2_FLOAT(pc + k * 2, stride_ldc, vcx2, vl); } pc += vl * ldc * 2; } diff --git a/kernel/riscv64/trsm_kernel_RN_rvv_v1.c b/kernel/riscv64/trsm_kernel_RN_rvv_v1.c index 4c83bbaa3..32e481036 100644 --- a/kernel/riscv64/trsm_kernel_RN_rvv_v1.c +++ b/kernel/riscv64/trsm_kernel_RN_rvv_v1.c @@ -31,13 +31,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e32m2() #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VSSEV_FLOAT __riscv_vsse32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSSEG2_FLOAT __riscv_vssseg2e32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 @@ -45,13 +46,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m2() #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VSSEV_FLOAT __riscv_vsse64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSSEG2_FLOAT __riscv_vssseg2e64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 @@ -133,6 +135,7 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B int i, j, k; + FLOAT_VX2_T vax2, vsx2, vcx2; FLOAT_V_T va1, va2, vs1, vs2, vc1, vc2; size_t vl; @@ -147,7 +150,9 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B for (j = m; j > 0; j -= vl) { vl = VSETVL(j); - VLSEG2_FLOAT(&va1, &va2, pci, vl); + vax2 = VLSEG2_FLOAT(pci, vl); + va1 = VGET_VX2(vax2, 0); + va2 = VGET_VX2(vax2, 1); #ifndef CONJ vs1 = VFMULVF_FLOAT(va1, bb1, vl); vs1 = VFNMSACVF_FLOAT(vs1, bb2, va2, vl); @@ -159,13 +164,17 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B vs2 = VFMULVF_FLOAT(va2, bb1, vl); vs2 = VFNMSACVF_FLOAT(vs2, bb2, va1, vl); #endif - VSSEG2_FLOAT(a, vs1, vs2, vl); - VSSEG2_FLOAT(pci, vs1, vs2, vl); + vsx2 = VSET_VX2(vsx2, 0, vs1); + vsx2 = VSET_VX2(vsx2, 1, vs2); + VSSEG2_FLOAT(a, vsx2, vl); + VSSEG2_FLOAT(pci, vsx2, vl); a += vl * 2; pci += vl * 2; for (k = i + 1; k < n; k ++){ - VLSEG2_FLOAT(&vc1, &vc2, pcj + k * ldc * 2, vl); + vcx2 = VLSEG2_FLOAT(pcj + k * ldc * 2, vl); + vc1 = VGET_VX2(vcx2, 0); + vc2 = VGET_VX2(vcx2, 1); #ifndef CONJ vc1 = VFMACCVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl); vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl); @@ -177,7 +186,9 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B vc2 = VFMACCVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl); vc2 = VFNMSACVF_FLOAT(vc2, *(b + k 
* 2 + 0), vs2, vl); #endif - VSSEG2_FLOAT(pcj + k * ldc * 2, vc1, vc2, vl); + vcx2 = VSET_VX2(vcx2, 0, vc1); + vcx2 = VSET_VX2(vcx2, 1, vc2); + VSSEG2_FLOAT(pcj + k * ldc * 2, vcx2, vl); } pcj += vl * 2; } diff --git a/kernel/riscv64/trsm_kernel_RT_rvv_v1.c b/kernel/riscv64/trsm_kernel_RT_rvv_v1.c index b368eefb9..81cc41818 100644 --- a/kernel/riscv64/trsm_kernel_RT_rvv_v1.c +++ b/kernel/riscv64/trsm_kernel_RT_rvv_v1.c @@ -31,10 +31,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e32m2() #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 @@ -42,10 +45,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m2() #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 @@ -133,6 +139,7 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B int i, j, k; + FLOAT_VX2_T vax2, vsx2, vcx2; FLOAT_V_T va1, va2, vs1, vs2, vc1, vc2; size_t vl; @@ -149,7 +156,9 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B pcj = c; for (j = m; j > 0; j -= vl) { vl = VSETVL(j); - VLSEG2_FLOAT(&va1, &va2, pci, vl); + vax2 = VLSEG2_FLOAT(pci, vl); + va1 = VGET_VX2(vax2, 0); + va2 = VGET_VX2(vax2, 1); #ifndef CONJ vs1 = VFMULVF_FLOAT(va1, bb1, vl); vs1 = VFNMSACVF_FLOAT(vs1, bb2, va2, vl); @@ -161,13 +170,17 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B vs2 = VFMULVF_FLOAT(va2, bb1, vl); vs2 = VFNMSACVF_FLOAT(vs2, bb2, va1, vl); #endif - VSSEG2_FLOAT(a, vs1, vs2, vl); - VSSEG2_FLOAT(pci, vs1, vs2, vl); + vsx2 = VSET_VX2(vsx2, 0, vs1); + vsx2 = VSET_VX2(vsx2, 1, vs2); + VSSEG2_FLOAT(a, vsx2, vl); + VSSEG2_FLOAT(pci, vsx2, vl); a += vl * 2; pci += vl * 2; for (k = 0; k < i; k ++){ - VLSEG2_FLOAT(&vc1, &vc2, pcj + k * ldc * 2, vl); + vcx2 = VLSEG2_FLOAT(pcj + k * ldc * 2, vl); + vc1 = VGET_VX2(vcx2, 0); + vc2 = VGET_VX2(vcx2, 1); #ifndef CONJ vc1 = VFMACCVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl); vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl); @@ -179,7 +192,9 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B vc2 = VFMACCVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl); vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl); #endif - VSSEG2_FLOAT(pcj + k * ldc * 2, vc1, vc2, vl); + vcx2 = VSET_VX2(vcx2, 0, vc1); + vcx2 = VSET_VX2(vcx2, 1, vc2); + 
VSSEG2_FLOAT(pcj + k * ldc * 2, vcx2, vl); } pcj += vl * 2; } diff --git a/kernel/riscv64/zamax_rvv.c b/kernel/riscv64/zamax_rvv.c index bbb1e876b..180cf059a 100644 --- a/kernel/riscv64/zamax_rvv.c +++ b/kernel/riscv64/zamax_rvv.c @@ -34,8 +34,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 #define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m4_f32m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 @@ -49,8 +51,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 #define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m4_f64m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 @@ -68,6 +72,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT_V_T v0, v1, vmax; FLOAT_V_T_M1 v_res; + FLOAT_VX2_T vx2; v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); size_t vlmax = VSETVL_MAX; @@ -78,7 +83,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for (size_t vl; n > 0; n -= vl, x += vl*2) { vl = VSETVL(n); - VLSEG_FLOAT(&v0, &v1, x, vl); + vx2 = VLSEG_FLOAT(x, vl); + + v0 = VGET_VX2(vx2, 0); + v1 = VGET_VX2(vx2, 1); v0 = VFABSV_FLOAT(v0, vl); v1 = VFABSV_FLOAT(v1, vl); @@ -95,7 +103,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl); + vx2 = VLSSEG_FLOAT(x, stride_x, vl); + + v0 = VGET_VX2(vx2, 0); + v1 = VGET_VX2(vx2, 1); v0 = VFABSV_FLOAT(v0, vl); v1 = VFABSV_FLOAT(v1, vl); diff --git a/kernel/riscv64/zamin_rvv.c b/kernel/riscv64/zamin_rvv.c index c5453121b..56a467502 100644 --- a/kernel/riscv64/zamin_rvv.c +++ b/kernel/riscv64/zamin_rvv.c @@ -34,8 +34,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m4_f32m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 @@ -49,8 +51,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
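In zamax, once the |re| + |im| values have been folded into vmax across iterations, the final answer comes from a single-register reduction. The sketch below shows that last step with the same intrinsics the kernel's macros expand to (reduce_max_f32 is an illustrative helper):

#include <riscv_vector.h>
#include <stddef.h>

/* Reduce an m4 accumulator to one scalar: seed an m1 register with 0
 * (safe here because the inputs are absolute values) and take vfredmax. */
static float reduce_max_f32(vfloat32m4_t vmax, size_t vl)
{
    vfloat32m1_t seed = __riscv_vfmv_v_f_f32m1(0.0f, __riscv_vsetvlmax_e32m1());
    vfloat32m1_t red  = __riscv_vfredmax_vs_f32m4_f32m1(vmax, seed, vl);
    return __riscv_vfmv_f_s_f32m1_f32(red);
}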
#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m4_f64m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 @@ -68,6 +72,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT_V_T v0, v1, vmin; FLOAT_V_T_M1 v_res; + FLOAT_VX2_T vx2; v_res = VFMVVF_FLOAT_M1(FLT_MAX, VSETVL_MAX_M1); size_t vlmax = VSETVL_MAX; @@ -78,7 +83,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for (size_t vl; n > 0; n -= vl, x += vl*2) { vl = VSETVL(n); - VLSEG_FLOAT(&v0, &v1, x, vl); + vx2 = VLSEG_FLOAT(x, vl); + + v0 = VGET_VX2(vx2, 0); + v1 = VGET_VX2(vx2, 1); v0 = VFABSV_FLOAT(v0, vl); v1 = VFABSV_FLOAT(v1, vl); @@ -94,7 +102,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl); + vx2 = VLSSEG_FLOAT(x, stride_x, vl); + + v0 = VGET_VX2(vx2, 0); + v1 = VGET_VX2(vx2, 1); v0 = VFABSV_FLOAT(v0, vl); v1 = VFABSV_FLOAT(v1, vl); diff --git a/kernel/riscv64/zaxpby_rvv.c b/kernel/riscv64/zaxpby_rvv.c index e0da55311..66e38c1e4 100644 --- a/kernel/riscv64/zaxpby_rvv.c +++ b/kernel/riscv64/zaxpby_rvv.c @@ -35,6 +35,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VSET_VX2 __riscv_vset_v_f32m4_f32m4x2 #define VLSEV_FLOAT __riscv_vlse32_v_f32m4 #define VSSEV_FLOAT __riscv_vsse32_v_f32m4 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 @@ -42,13 +45,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4 #define VFMSACVF_FLOAT __riscv_vfmsac_vf_f32m4 -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 -#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4x2 #else #define VSETVL(n) __riscv_vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VSET_VX2 __riscv_vset_v_f64m4_f64m4x2 #define VLSEV_FLOAT __riscv_vlse64_v_f64m4 #define VSSEV_FLOAT __riscv_vsse64_v_f64m4 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 @@ -56,10 +62,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4 #define VFMSACVF_FLOAT __riscv_vfmsac_vf_f64m4 -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 -#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4x2 #endif int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FLOAT beta_r, FLOAT beta_i,FLOAT *y, BLASLONG inc_y) @@ -74,6 +80,7 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL BLASLONG stride_x = inc_x2 * sizeof(FLOAT); BLASLONG stride_y = inc_y2 * sizeof(FLOAT); FLOAT_V_T vx0, vx1, vy0, vy1; + FLOAT_VX2_T vxx2, vyx2; if ( beta_r == 0.0 && beta_i == 0.0) { @@ -81,10 +88,12 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL { size_t vl = VSETVL(n); FLOAT_V_T temp = VFMVVF_FLOAT(0.0, vl); - for ( ; n > 0; n -= vl, y += vl*stride_y) + vxx2 = VSET_VX2(vxx2, 0, temp); + vxx2 = VSET_VX2(vxx2, 1, temp); + for ( ; n > 0; n -= vl, y += vl*inc_y2) { vl = VSETVL(n); - VSSSEG_FLOAT(y, stride_y, temp, temp, vl); + VSSSEG_FLOAT(y, stride_y, vxx2, vl); } } else @@ -92,7 +101,10 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL for (size_t vl; n > 0; n -= vl, x += vl*inc_x2, y += vl*inc_y2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); vy0 = VFMULVF_FLOAT(vx1, alpha_i, vl); vy0 = VFMSACVF_FLOAT(vy0, alpha_r, vx0, vl); @@ -100,20 +112,26 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL vy1 = VFMULVF_FLOAT(vx1, alpha_r, vl); vy1 = VFMACCVF_FLOAT(vy1, alpha_i, vx0, vl); - VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSSEG_FLOAT(y, stride_y, vyx2, vl); } } } else { FLOAT_V_T v0, v1; + FLOAT_VX2_T v_x2; if ( alpha_r == 0.0 && alpha_i == 0.0 ) { for (size_t vl; n > 0; n -= vl, y += vl*inc_y2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); v0 = VFMULVF_FLOAT(vy1, beta_i, vl); v0 = VFMSACVF_FLOAT(v0, beta_r, vy0, vl); @@ -121,7 +139,9 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL v1 = VFMULVF_FLOAT(vy1, beta_r, vl); v1 = VFMACCVF_FLOAT(v1, beta_i, vy0, vl); - VSSSEG_FLOAT(y, stride_y, v0, v1, vl); + v_x2 = VSET_VX2(v_x2, 0, v0); + v_x2 = VSET_VX2(v_x2, 1, v1); + VSSSEG_FLOAT(y, stride_y, v_x2, vl); } } else @@ -129,8 +149,14 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL for (size_t vl; n > 0; n -= vl, x += vl*inc_x2, y += vl*inc_y2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); - VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); v0 = VFMULVF_FLOAT(vx0, alpha_r, vl); v0 = VFNMSACVF_FLOAT(v0, alpha_i, vx1, vl); @@ -142,7 +168,10 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL v1 = VFMACCVF_FLOAT(v1, beta_r, vy1, vl); v1 = VFMACCVF_FLOAT(v1, 
beta_i, vy0, vl); - VSSSEG_FLOAT(y, stride_y, v0, v1, vl); + v_x2 = VSET_VX2(v_x2, 0, v0); + v_x2 = VSET_VX2(v_x2, 1, v1); + + VSSSEG_FLOAT(y, stride_y, v_x2, vl); } } } diff --git a/kernel/riscv64/zaxpy_rvv.c b/kernel/riscv64/zaxpy_rvv.c index 3f75898e0..0db32df10 100644 --- a/kernel/riscv64/zaxpy_rvv.c +++ b/kernel/riscv64/zaxpy_rvv.c @@ -30,19 +30,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 -#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 -#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4 +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VSET_VX2 __riscv_vset_v_f32m4_f32m4x2 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 #else #define VSETVL(n) __riscv_vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 -#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 -#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4 +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VSET_VX2 __riscv_vset_v_f64m4_f64m4x2 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 #endif @@ -53,14 +59,21 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, if(da_r == 0.0 && da_i == 0.0) return(0); FLOAT_V_T vx0, vx1, vy0, vy1; + FLOAT_VX2_T vxx2, vyx2; if(inc_x == 1 && inc_y == 1) { for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) { vl = VSETVL(n); - VLSEG_FLOAT(&vx0, &vx1, x, vl); - VLSEG_FLOAT(&vy0, &vy1, y, vl); + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + #if !defined(CONJ) vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl); @@ -72,7 +85,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl); vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); #endif - VSSEG_FLOAT(y, vy0, vy1, vl); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSEG_FLOAT(y, vyx2, vl); } } else if (inc_x == 1) { @@ -82,8 +97,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { vl = VSETVL(n); - VLSEG_FLOAT(&vx0, &vx1, x, vl); - VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); #if !defined(CONJ) vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); @@ -96,7 +116,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl); vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); #endif - VSSSEG_FLOAT(y, stride_y, vy0, 
vy1, vl); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSSEG_FLOAT(y, stride_y, vyx2, vl); } } else if (inc_y == 1) { @@ -106,8 +128,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); - VLSEG_FLOAT(&vy0, &vy1, y, vl); + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); #if !defined(CONJ) vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); @@ -120,7 +147,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl); vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); #endif - VSSEG_FLOAT(y, vy0, vy1, vl); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSEG_FLOAT(y, vyx2, vl); } } else { @@ -131,8 +160,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); - VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); #if !defined(CONJ) vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); @@ -145,7 +179,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl); vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); #endif - VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSSEG_FLOAT(y, stride_y, vyx2, vl); } } diff --git a/kernel/riscv64/zcopy_rvv.c b/kernel/riscv64/zcopy_rvv.c index bd94810ce..13879f03b 100644 --- a/kernel/riscv64/zcopy_rvv.c +++ b/kernel/riscv64/zcopy_rvv.c @@ -34,11 +34,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSEV_FLOAT_M8 __riscv_vse32_v_f32m8 #define VSETVL_M4(n) __riscv_vsetvl_e32m4(n) -#define FLOAT_V_T_M4 vfloat32m4_t -#define VLSEG_FLOAT_M4 __riscv_vlseg2e32_v_f32m4 -#define VSSEG_FLOAT_M4 __riscv_vsseg2e32_v_f32m4 -#define VLSSEG_FLOAT_M4 __riscv_vlsseg2e32_v_f32m4 -#define VSSSEG_FLOAT_M4 __riscv_vssseg2e32_v_f32m4 +#define FLOAT_VX2_T_M4 vfloat32m4x2_t +#define VLSEG_FLOAT_M4 __riscv_vlseg2e32_v_f32m4x2 +#define VSSEG_FLOAT_M4 __riscv_vsseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT_M4 __riscv_vlsseg2e32_v_f32m4x2 +#define VSSSEG_FLOAT_M4 __riscv_vssseg2e32_v_f32m4x2 #else #define VSETVL_M8(n) __riscv_vsetvl_e64m8(n) #define FLOAT_V_T_M8 vfloat64m8_t @@ -46,16 +46,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VSEV_FLOAT_M8 __riscv_vse64_v_f64m8 #define VSETVL_M4(n) __riscv_vsetvl_e64m4(n) -#define FLOAT_V_T_M4 vfloat64m4_t -#define VLSEG_FLOAT_M4 __riscv_vlseg2e64_v_f64m4 -#define VSSEG_FLOAT_M4 __riscv_vsseg2e64_v_f64m4 -#define VLSSEG_FLOAT_M4 __riscv_vlsseg2e64_v_f64m4 -#define VSSSEG_FLOAT_M4 __riscv_vssseg2e64_v_f64m4 +#define FLOAT_VX2_T_M4 vfloat64m4x2_t +#define VLSEG_FLOAT_M4 __riscv_vlseg2e64_v_f64m4x2 +#define VSSEG_FLOAT_M4 __riscv_vsseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT_M4 __riscv_vlsseg2e64_v_f64m4x2 +#define VSSSEG_FLOAT_M4 __riscv_vssseg2e64_v_f64m4x2 #endif int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { - if(n < 0) return(0); + if(n <= 0) return(0); if(inc_x == 1 && inc_y == 1) { @@ -70,34 +70,34 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) }else if (1 == inc_x) { - FLOAT_V_T_M4 vr, vi; + FLOAT_VX2_T_M4 vx2; BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); for(size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { vl = VSETVL_M4(n); - VLSEG_FLOAT_M4(&vr, &vi, x, vl); - VSSSEG_FLOAT_M4(y, stride_y, vr, vi, vl); + vx2 = VLSEG_FLOAT_M4(x, vl); + VSSSEG_FLOAT_M4(y, stride_y, vx2, vl); } } else if (1 == inc_y) { - FLOAT_V_T_M4 vr, vi; + FLOAT_VX2_T_M4 vx2; BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); for(size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { vl = VSETVL_M4(n); - VLSSEG_FLOAT_M4(&vr, &vi, x, stride_x, vl); - VSSEG_FLOAT_M4(y, vr, vi, vl); + vx2 = VLSSEG_FLOAT_M4(x, stride_x, vl); + VSSEG_FLOAT_M4(y, vx2, vl); } } else { - FLOAT_V_T_M4 vr, vi; + FLOAT_VX2_T_M4 vx2; BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); for(size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { vl = VSETVL_M4(n); - VLSSEG_FLOAT_M4(&vr, &vi, x, stride_x, vl); - VSSSEG_FLOAT_M4(y, stride_y, vr, vi, vl); + vx2 = VLSSEG_FLOAT_M4(x, stride_x, vl); + VSSSEG_FLOAT_M4(y, stride_y, vx2, vl); } } diff --git a/kernel/riscv64/zdot_rvv.c b/kernel/riscv64/zdot_rvv.c index fa0e89353..13bc2ee39 100644 --- a/kernel/riscv64/zdot_rvv.c +++ b/kernel/riscv64/zdot_rvv.c @@ -33,8 +33,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 @@ -49,8 +51,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
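zcopy is the simplest beneficiary of the tuple API: since nothing is computed on the data, the tuple returned by the segment load goes straight into the segment store and the old vr/vi temporaries vanish. A condensed sketch of the inc_y == 1 branch (strided load, contiguous store); copy_zstrided_f32 is an illustrative name:

#include <riscv_vector.h>
#include <stddef.h>

/* Copy n complex floats from x (element stride inc_x) to contiguous y. */
static void copy_zstrided_f32(size_t n, const float *x, size_t inc_x, float *y)
{
    ptrdiff_t stride_x = (ptrdiff_t)(inc_x * 2 * sizeof(float));  /* bytes per complex step */
    for (size_t vl; n > 0; n -= vl, x += vl * inc_x * 2, y += vl * 2) {
        vl = __riscv_vsetvl_e32m4(n);
        vfloat32m4x2_t v = __riscv_vlsseg2e32_v_f32m4x2(x, stride_x, vl);
        __riscv_vsseg2e32_v_f32m4x2(y, v, vl);
    }
}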
#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 @@ -71,6 +75,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA FLOAT_V_T vr0, vr1, vx0, vx1, vy0, vy1; FLOAT_V_T_M1 v_res, v_z0; + FLOAT_VX2_T vxx2, vyx2; size_t vlmax_m1 = VSETVL_MAX_M1; v_z0 = VFMVVF_FLOAT_M1(0, vlmax_m1); @@ -83,8 +88,13 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) { vl = VSETVL(n); - VLSEG_FLOAT(&vx0, &vx1, x, vl); - VLSEG_FLOAT(&vy0, &vy1, y, vl); + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl); vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl); @@ -104,8 +114,13 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { vl = VSETVL(n); - VLSEG_FLOAT(&vx0, &vx1, x, vl); - VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl); vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl); @@ -124,8 +139,13 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); - VLSEG_FLOAT(&vy0, &vy1, y, vl); + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl); vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl); @@ -145,8 +165,13 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); - VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl); vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl); diff --git a/kernel/riscv64/zgemm_beta_rvv.c b/kernel/riscv64/zgemm_beta_rvv.c index b94b5f4bf..ee334801b 100644 --- a/kernel/riscv64/zgemm_beta_rvv.c +++ b/kernel/riscv64/zgemm_beta_rvv.c @@ -41,8 +41,11 @@ #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VSET_VX2 __riscv_vset_v_f32m4_f32m4x2 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4x2 #define VFMVVF_FLOAT 
__riscv_vfmv_v_f_f32m4 #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4 #define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 @@ -50,8 +53,11 @@ #else #define VSETVL(n) __riscv_vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VSET_VX2 __riscv_vset_v_f64m4_f64m4x2 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4x2 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4 #define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 @@ -68,6 +74,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT *c_offset; size_t vl; FLOAT_V_T vr, vi, v1, v2, v3, v4; + FLOAT_VX2_T vx2; ldc *= 2; c_offset = c; @@ -77,6 +84,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, vl = VSETVL(m); vr = VFMVVF_FLOAT(0.0, vl); vi = VFMVVF_FLOAT(0.0, vl); + vx2 = VSET_VX2(vx2, 0, vr); + vx2 = VSET_VX2(vx2, 1, vi); for( ; n > 0; n--, c += ldc) { c_offset = c; @@ -84,7 +93,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl*2) { vl = VSETVL(chunk); - VSSEG_FLOAT(c_offset, vr, vi, vl); + VSSEG_FLOAT(c_offset, vx2, vl); } } @@ -96,7 +105,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl*2) { vl = VSETVL(chunk); - VLSEG_FLOAT(&vr, &vi, c_offset, vl); + vx2 = VLSEG_FLOAT(c_offset, vl); + vr = VGET_VX2(vx2, 0); + vi = VGET_VX2(vx2, 1); v1 = VFMULVF_FLOAT(vr, beta_r, vl); v2 = VFMULVF_FLOAT(vi, beta_i, vl); @@ -107,7 +118,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, vr = VFSUBVV_FLOAT(v1, v2, vl); vi = VFADDVV_FLOAT(v3, v4, vl); - VSSEG_FLOAT(c_offset, vr, vi, vl); + vx2 = VSET_VX2(vx2, 0, vr); + vx2 = VSET_VX2(vx2, 1, vi); + VSSEG_FLOAT(c_offset, vx2, vl); } } diff --git a/kernel/riscv64/zgemm_ncopy_4_rvv.c b/kernel/riscv64/zgemm_ncopy_4_rvv.c index d50a4b8d5..dce98752e 100644 --- a/kernel/riscv64/zgemm_ncopy_4_rvv.c +++ b/kernel/riscv64/zgemm_ncopy_4_rvv.c @@ -29,18 +29,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m1(n) -#define FLOAT_V_T vfloat32m1_t -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m1 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1 -#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1 -#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1 +#define FLOAT_VX2_T vfloat32m1x2_t +#define FLOAT_VX4_T vfloat32m1x4_t +#define FLOAT_VX8_T vfloat32m1x8_t +#define VGET_VX2 __riscv_vget_v_f32m1x2_f32m1 +#define VSET_VX2 __riscv_vset_v_f32m1_f32m1x2 +#define VSET_VX4 __riscv_vset_v_f32m1_f32m1x4 +#define VSET_VX8 __riscv_vset_v_f32m1_f32m1x8 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m1x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2 +#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4 +#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1x8 #else #define VSETVL(n) __riscv_vsetvl_e64m1(n) -#define FLOAT_V_T vfloat64m1_t -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m1 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1 -#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1 -#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1 +#define FLOAT_VX2_T vfloat64m1x2_t +#define FLOAT_VX4_T vfloat64m1x4_t +#define FLOAT_VX8_T vfloat64m1x8_t +#define VGET_VX2 __riscv_vget_v_f64m1x2_f64m1 +#define VSET_VX2 __riscv_vset_v_f64m1_f64m1x2 +#define VSET_VX4 __riscv_vset_v_f64m1_f64m1x4 +#define VSET_VX8 __riscv_vset_v_f64m1_f64m1x8 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m1x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2 +#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4 +#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1x8 #endif // Optimizes the implementation in ../generic/zgemm_ncopy_4.c @@ -53,7 +65,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ FLOAT *boffset; - FLOAT_V_T v11, v12, v21, v22, v31, v32, v41, v42; + FLOAT_VX2_T v1x2, v2x2, v3x2, v4x2; + FLOAT_VX4_T vxx4; + FLOAT_VX8_T vxx8; size_t vl; aoffset = a; @@ -69,12 +83,21 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ for (i = m; i > 0; i -= vl) { vl = VSETVL(i); - VLSEG2_FLOAT(&v11, &v12, aoffset1, vl); - VLSEG2_FLOAT(&v21, &v22, aoffset2, vl); - VLSEG2_FLOAT(&v31, &v32, aoffset3, vl); - VLSEG2_FLOAT(&v41, &v42, aoffset4, vl); + v1x2 = VLSEG2_FLOAT(aoffset1, vl); + v2x2 = VLSEG2_FLOAT(aoffset2, vl); + v3x2 = VLSEG2_FLOAT(aoffset3, vl); + v4x2 = VLSEG2_FLOAT(aoffset4, vl); + + vxx8 = VSET_VX8(vxx8, 0, VGET_VX2(v1x2, 0)); + vxx8 = VSET_VX8(vxx8, 1, VGET_VX2(v1x2, 1)); + vxx8 = VSET_VX8(vxx8, 2, VGET_VX2(v2x2, 0)); + vxx8 = VSET_VX8(vxx8, 3, VGET_VX2(v2x2, 1)); + vxx8 = VSET_VX8(vxx8, 4, VGET_VX2(v3x2, 0)); + vxx8 = VSET_VX8(vxx8, 5, VGET_VX2(v3x2, 1)); + vxx8 = VSET_VX8(vxx8, 6, VGET_VX2(v4x2, 0)); + vxx8 = VSET_VX8(vxx8, 7, VGET_VX2(v4x2, 1)); - VSSEG8_FLOAT(boffset, v11, v12, v21, v22, v31, v32, v41, v42, vl); + VSSEG8_FLOAT(boffset, vxx8, vl); aoffset1 += vl * 2; aoffset2 += vl * 2; @@ -91,10 +114,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ for (i = m; i > 0; i -= vl) { vl = VSETVL(i); - VLSEG2_FLOAT(&v11, &v12, aoffset1, vl); - VLSEG2_FLOAT(&v21, &v22, aoffset2, vl); + v1x2 = VLSEG2_FLOAT(aoffset1, vl); + v2x2 = VLSEG2_FLOAT(aoffset2, vl); + + vxx4 = VSET_VX4(vxx4, 0, VGET_VX2(v1x2, 0)); + vxx4 = VSET_VX4(vxx4, 1, VGET_VX2(v1x2, 1)); + vxx4 = VSET_VX4(vxx4, 2, VGET_VX2(v2x2, 0)); + vxx4 = VSET_VX4(vxx4, 3, VGET_VX2(v2x2, 1)); - VSSEG4_FLOAT(boffset, v11, v12, v21, v22, vl); + VSSEG4_FLOAT(boffset, vxx4, vl); aoffset1 += vl * 2; aoffset2 += vl * 2; @@ -108,9 +136,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ for (i 
= m; i > 0; i -= vl) { vl = VSETVL(i); - VLSEG2_FLOAT(&v11, &v12, aoffset1, vl); + v1x2 = VLSEG2_FLOAT(aoffset1, vl); - VSSEG2_FLOAT(boffset, v11, v12, vl); + VSSEG2_FLOAT(boffset, v1x2, vl); aoffset1 += vl * 2; boffset += vl * 2; diff --git a/kernel/riscv64/zgemm_ncopy_rvv_v1.c b/kernel/riscv64/zgemm_ncopy_rvv_v1.c index 1d3b8d3b7..275daa5f2 100644 --- a/kernel/riscv64/zgemm_ncopy_rvv_v1.c +++ b/kernel/riscv64/zgemm_ncopy_rvv_v1.c @@ -30,14 +30,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define FLOAT_VX2_T vfloat32m2x2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #else #define VSETVL(n) __riscv_vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define FLOAT_VX2_T vfloat64m2x2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #endif int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ @@ -48,7 +48,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ FLOAT *a_offset1; FLOAT *b_offset; - FLOAT_V_T v0, v1; + FLOAT_VX2_T vx2; size_t vl; //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); @@ -62,8 +62,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ a_offset += vl * lda * 2; for(i = m; i > 0; i--) { - VLSSEG2_FLOAT(&v0, &v1, a_offset1, lda * sizeof(FLOAT) * 2, vl); - VSSEG2_FLOAT(b_offset, v0, v1, vl); + vx2 = VLSSEG2_FLOAT(a_offset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG2_FLOAT(b_offset, vx2, vl); a_offset1 += 2; b_offset += vl * 2; diff --git a/kernel/riscv64/zgemm_tcopy_4_rvv.c b/kernel/riscv64/zgemm_tcopy_4_rvv.c index 8c35b5616..cfafbf0dc 100644 --- a/kernel/riscv64/zgemm_tcopy_4_rvv.c +++ b/kernel/riscv64/zgemm_tcopy_4_rvv.c @@ -30,25 +30,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m1(n) #define FLOAT_V_T vfloat32m1_t +#define FLOAT_VX2_T vfloat32m1x2_t +#define FLOAT_VX4_T vfloat32m1x4_t +#define FLOAT_VX8_T vfloat32m1x8_t #define VLEV_FLOAT __riscv_vle32_v_f32m1 #define VSEV_FLOAT __riscv_vse32_v_f32m1 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m1 -#define VLSSEG4_FLOAT __riscv_vlsseg4e32_v_f32m1 -#define VLSSEG8_FLOAT __riscv_vlsseg8e32_v_f32m1 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1 -#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1 -#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m1x2 +#define VLSSEG4_FLOAT __riscv_vlsseg4e32_v_f32m1x4 +#define VLSSEG8_FLOAT __riscv_vlsseg8e32_v_f32m1x8 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2 +#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4 +#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1x8 #else #define VSETVL(n) __riscv_vsetvl_e64m1(n) #define FLOAT_V_T vfloat64m1_t +#define FLOAT_VX2_T vfloat64m1x2_t +#define FLOAT_VX4_T vfloat64m1x4_t +#define FLOAT_VX8_T vfloat64m1x8_t #define VLEV_FLOAT __riscv_vle64_v_f64m1 #define VSEV_FLOAT __riscv_vse64_v_f64m1 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m1 -#define VLSSEG4_FLOAT __riscv_vlsseg4e64_v_f64m1 -#define VLSSEG8_FLOAT __riscv_vlsseg8e64_v_f64m1 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1 -#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1 -#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m1x2 +#define VLSSEG4_FLOAT __riscv_vlsseg4e64_v_f64m1x4 +#define VLSSEG8_FLOAT __riscv_vlsseg8e64_v_f64m1x8 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2 +#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4 +#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1x8 #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ @@ -60,7 +66,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ IFLOAT *boffset, *boffset1, *boffset2, *boffset3; - FLOAT_V_T v0, v1, v2, v3, v4, v5, v6, v7; + FLOAT_V_T v0; + FLOAT_VX2_T vx2; + FLOAT_VX4_T vx4; + FLOAT_VX8_T vx8; + size_t vl; //fprintf(stderr, "%s m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); @@ -81,8 +91,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ for(i = (n >> 2); i > 0; i--) { vl = 4; - VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT) * 2, vl); - VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl); + vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG8_FLOAT(boffset1, vx8, vl); aoffset1 += 8; boffset1 += m * 8; @@ -91,8 +101,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ if (n & 2) { vl = 4; - VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT) * 2, vl); - VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl); + vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG4_FLOAT(boffset2, vx4, vl); aoffset1 += 4; boffset2 += 16; @@ -101,8 +111,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ if (n & 1) { vl = 4; - VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT) * 2, vl); - VSSEG2_FLOAT(boffset3, v0, v1, vl); + vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG2_FLOAT(boffset3, vx2, vl); aoffset1 += 2; boffset3 += 8; @@ -119,8 +129,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ for(i = (n >> 2); i > 0; i--) { vl = 2; - VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT) * 2, vl); - VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, 
v5, v6, v7, vl); + vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG8_FLOAT(boffset1, vx8, vl); aoffset1 += 8; boffset1 += m * 8; @@ -129,8 +139,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ if (n & 2) { vl = 2; - VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT) * 2, vl); - VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl); + vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG4_FLOAT(boffset2, vx4, vl); aoffset1 += 4; boffset2 += 8; @@ -139,8 +149,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ if (n & 1) { vl = 2; - VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT) * 2, vl); - VSSEG2_FLOAT(boffset3, v0, v1, vl); + vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG2_FLOAT(boffset3, vx2, vl); //aoffset1 += 2; boffset3 += 4; diff --git a/kernel/riscv64/zgemm_tcopy_rvv_v1.c b/kernel/riscv64/zgemm_tcopy_rvv_v1.c index 7a085269c..96e986502 100644 --- a/kernel/riscv64/zgemm_tcopy_rvv_v1.c +++ b/kernel/riscv64/zgemm_tcopy_rvv_v1.c @@ -29,14 +29,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define FLOAT_VX2_T vfloat32m2x2_t +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #else #define VSETVL(n) __riscv_vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define FLOAT_VX2_T vfloat64m2x2_t +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #endif int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) @@ -47,7 +47,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) IFLOAT *aoffset1; IFLOAT *boffset; - FLOAT_V_T v0, v1; + FLOAT_VX2_T vx2; size_t vl; //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); @@ -62,8 +62,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) aoffset += vl * 2; for(i = m; i > 0; i--) { - VLSEG2_FLOAT(&v0, &v1, aoffset1, vl); - VSSEG2_FLOAT(boffset, v0, v1, vl); + vx2 = VLSEG2_FLOAT(aoffset1, vl); + VSSEG2_FLOAT(boffset, vx2, vl); aoffset1 += lda * 2; boffset += vl * 2; diff --git a/kernel/riscv64/zgemmkernel_rvv_v1x4.c b/kernel/riscv64/zgemmkernel_rvv_v1x4.c index 41399cf79..77e012ff5 100644 --- a/kernel/riscv64/zgemmkernel_rvv_v1x4.c +++ b/kernel/riscv64/zgemmkernel_rvv_v1x4.c @@ -30,20 +30,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 #else #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 @@ -80,6 +86,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b BLASLONG i,j,k; FLOAT *C0, *C1, *C2, *C3, *ptrba,*ptrbb; + FLOAT_VX2_T vax2; FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7; FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; @@ -109,10 +116,14 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b for (k = bk/4; k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; - VLSEG2_FLOAT(&va2, &va3, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -137,7 +148,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 8; - VLSEG2_FLOAT(&va4, &va5, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va4 = VGET_VX2(vax2, 0); + va5 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); @@ -162,7 +175,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 8; - VLSEG2_FLOAT(&va6, &va7, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va6 = VGET_VX2(vax2, 0); + va7 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); @@ -211,7 +226,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b for (k = (bk & 3); k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -237,35 +254,57 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 8; } - VLSEG2_FLOAT(&va0, &va1, C0, vl); - VLSEG2_FLOAT(&va2, &va3, C1, vl); + vax2 = VLSEG2_FLOAT(C0, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + + vax2 = VLSEG2_FLOAT(C1, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); va0 = VFMACCVF_FLOAT(va0, alphar, vres0, vl); va1 = VFMACCVF_FLOAT(va1, alphar, vres1, vl); va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); - VSSEG2_FLOAT(C0, va0, va1, vl); + + vax2 = VSET_VX2(vax2, 0, 
va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C0, vax2, vl); va2 = VFMACCVF_FLOAT(va2, alphar, vres2, vl); va3 = VFMACCVF_FLOAT(va3, alphar, vres3, vl); va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl); va3 = VFMACCVF_FLOAT(va3, alphai, vres2, vl); - VSSEG2_FLOAT(C1, va2, va3, vl); - VLSEG2_FLOAT(&va0, &va1, C2, vl); - VLSEG2_FLOAT(&va2, &va3, C3, vl); + vax2 = VSET_VX2(vax2, 0, va2); + vax2 = VSET_VX2(vax2, 1, va3); + VSSEG2_FLOAT(C1, vax2, vl); + + vax2 = VLSEG2_FLOAT(C2, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + + vax2 = VLSEG2_FLOAT(C3, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); va0 = VFMACCVF_FLOAT(va0, alphar, vres4, vl); va1 = VFMACCVF_FLOAT(va1, alphar, vres5, vl); va0 = VFNMSACVF_FLOAT(va0, alphai, vres5, vl); va1 = VFMACCVF_FLOAT(va1, alphai, vres4, vl); - VSSEG2_FLOAT(C2, va0, va1, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C2, vax2, vl); va2 = VFMACCVF_FLOAT(va2, alphar, vres6, vl); va3 = VFMACCVF_FLOAT(va3, alphar, vres7, vl); va2 = VFNMSACVF_FLOAT(va2, alphai, vres7, vl); va3 = VFMACCVF_FLOAT(va3, alphai, vres6, vl); - VSSEG2_FLOAT(C3, va2, va3, vl); + + vax2 = VSET_VX2(vax2, 0, va2); + vax2 = VSET_VX2(vax2, 1, va3); + VSSEG2_FLOAT(C3, vax2, vl); C0 += vl * 2; C1 += vl * 2; @@ -294,9 +333,14 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b for (k = bk/4; k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; - VLSEG2_FLOAT(&va2, &va3, ptrba, vl); + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -311,7 +355,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 4; - VLSEG2_FLOAT(&va4, &va5, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va4 = VGET_VX2(vax2, 0); + va5 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); @@ -326,7 +372,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 4; - VLSEG2_FLOAT(&va6, &va7, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va6 = VGET_VX2(vax2, 0); + va7 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); @@ -356,7 +404,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b for (k = (bk & 3); k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -372,20 +422,31 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 4; } - VLSEG2_FLOAT(&va0, &va1, C0, vl); - VLSEG2_FLOAT(&va2, &va3, C1, vl); + vax2 = VLSEG2_FLOAT(C0, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + + vax2 = VLSEG2_FLOAT(C1, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); va0 = VFMACCVF_FLOAT(va0, alphar, vres0, vl); va1 = VFMACCVF_FLOAT(va1, alphar, vres1, vl); va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); - VSSEG2_FLOAT(C0, va0, va1, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C0, vax2, vl); va2 = VFMACCVF_FLOAT(va2, alphar, vres2, vl); va3 = VFMACCVF_FLOAT(va3, alphar, vres3, vl); va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl); va3 = VFMACCVF_FLOAT(va3, alphai, vres2, vl); - 
VSSEG2_FLOAT(C1, va2, va3, vl); + + vax2 = VSET_VX2(vax2, 0, va2); + vax2 = VSET_VX2(vax2, 1, va3); + VSSEG2_FLOAT(C1, vax2, vl); C0 += vl * 2; C1 += vl * 2; @@ -409,9 +470,14 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b for (k = bk/4; k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; - VLSEG2_FLOAT(&va2, &va3, ptrba, vl); + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -420,7 +486,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); ptrbb += 2; - VLSEG2_FLOAT(&va4, &va5, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va4 = VGET_VX2(vax2, 0); + va5 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); @@ -430,7 +498,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 2; - VLSEG2_FLOAT(&va6, &va7, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va6 = VGET_VX2(vax2, 0); + va7 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); @@ -448,7 +518,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b for (k = (bk & 3); k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -458,12 +530,18 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 2; } - VLSEG2_FLOAT(&va0, &va1, C0, vl); + vax2 = VLSEG2_FLOAT(C0, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + va0 = VFMACCVF_FLOAT(va0, alphar, vres0, vl); va1 = VFMACCVF_FLOAT(va1, alphar, vres1, vl); va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); - VSSEG2_FLOAT(C0, va0, va1, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C0, vax2, vl); C0 += vl * 2; } diff --git a/kernel/riscv64/zgemv_n_rvv.c b/kernel/riscv64/zgemv_n_rvv.c index 4a40c30a7..f14ef5ba8 100644 --- a/kernel/riscv64/zgemv_n_rvv.c +++ b/kernel/riscv64/zgemv_n_rvv.c @@ -30,27 +30,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VSET_VX2 __riscv_vset_v_f32m4_f32m4x2 #define VLEV_FLOAT __riscv_vle32_v_f32m4 #define VLSEV_FLOAT __riscv_vlse32_v_f32m4 #define VSEV_FLOAT __riscv_vse32_v_f32m4 #define VSSEV_FLOAT __riscv_vsse32_v_f32m4 -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 -#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 #else #define VSETVL(n) __riscv_vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VSET_VX2 __riscv_vset_v_f64m4_f64m4x2 #define VLEV_FLOAT __riscv_vle64_v_f64m4 #define VLSEV_FLOAT __riscv_vlse64_v_f64m4 #define VSEV_FLOAT __riscv_vse64_v_f64m4 #define VSSEV_FLOAT __riscv_vsse64_v_f64m4 -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 -#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 #endif @@ -62,6 +68,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a_ptr; FLOAT temp_r, temp_i; FLOAT_V_T va0, va1, vy0, vy1; + FLOAT_VX2_T vax2, vyx2; BLASLONG stride_y = inc_y * sizeof(FLOAT) * 2; @@ -73,7 +80,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, vl = VSETVL(m); a_ptr = a; ix = 0; - VLSEG_FLOAT(&vy0, &vy1, y, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); for(i = 0; i < n; i++){ #if !defined(XCONJ) @@ -84,7 +94,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, temp_i = alpha_r * x[ix+1] - alpha_i * x[ix]; #endif - VLSEG_FLOAT(&va0, &va1, a_ptr, vl); + vax2 = VLSEG_FLOAT(a_ptr, vl); + + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); #if !defined(CONJ) #if !defined(XCONJ) vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); @@ -113,7 +126,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, a_ptr += lda2; ix += inc_x2; } - VSSEG_FLOAT(y, vy0, vy1, vl); + + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSEG_FLOAT(y, vyx2, vl); } } @@ -123,7 +139,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, vl = VSETVL(m); a_ptr = a; ix = 0; - VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); for(i = 0; i < n; i++){ #if !defined(XCONJ) @@ -134,7 +152,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, temp_i = alpha_r * x[ix+1] - alpha_i * x[ix]; #endif - VLSEG_FLOAT(&va0, &va1, a_ptr, vl); + vax2 = VLSEG_FLOAT(a_ptr, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); #if !defined(CONJ) #if !defined(XCONJ) 
vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); @@ -163,7 +183,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, a_ptr += lda2; ix += inc_x2; } - VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSSEG_FLOAT(y, stride_y, vyx2, vl); } } return(0); diff --git a/kernel/riscv64/zgemv_t_rvv.c b/kernel/riscv64/zgemv_t_rvv.c index 2f0380530..1c89a9f72 100644 --- a/kernel/riscv64/zgemv_t_rvv.c +++ b/kernel/riscv64/zgemv_t_rvv.c @@ -32,9 +32,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VFREDSUM_FLOAT_TU __riscv_vfredusum_vs_f32m4_f32m1_tu #define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu #define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 @@ -46,9 +48,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VFREDSUM_FLOAT_TU __riscv_vfredusum_vs_f64m4_f64m1_tu #define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu #define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 @@ -66,6 +70,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT_V_T va0, va1, vx0, vx1, vr, vi; FLOAT_V_T_M1 v_res, v_z0; + FLOAT_VX2_T vxx2, vax2; BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; //BLASLONG stride_a = sizeof(FLOAT) * 2; @@ -73,6 +78,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, BLASLONG lda2 = lda * 2; size_t vlmax = VSETVL_MAX_M1; + v_res = VFMVVF_FLOAT_M1(0, vlmax); v_z0 = VFMVVF_FLOAT_M1(0, vlmax); vlmax = VSETVL(m); @@ -86,8 +92,13 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, for(size_t vl, k = m; k > 0; k -= vl) { vl = VSETVL(k); - VLSEG_FLOAT(&va0, &va1, &a_ptr[j], vl); - VLSEG_FLOAT(&vx0, &vx1, &x[ix], vl); + vax2 = VLSEG_FLOAT(&a_ptr[j], vl); + vxx2 = VLSEG_FLOAT(&x[ix], vl); + + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) vr = VFMACCVV_FLOAT_TU(vr, va0, vx0, vl); @@ -104,9 +115,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, ix += vl * inc_x * 2; } - v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); + v_res = VFREDSUM_FLOAT_TU(v_res, vr, v_z0, vlmax); temp_r = VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUM_FLOAT(vi, v_z0, vlmax); + v_res = VFREDSUM_FLOAT_TU(v_res, vi, v_z0, vlmax); temp_i = VFMVFS_FLOAT_M1(v_res); #if !defined(XCONJ) @@ -130,8 +141,13 @@ int CNAME(BLASLONG m, BLASLONG n, 
BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, for(size_t vl, k = m; k > 0; k -= vl) { vl = VSETVL(k); - VLSEG_FLOAT(&va0, &va1, &a_ptr[j], vl); - VLSSEG_FLOAT(&vx0, &vx1, &x[ix], stride_x, vl); + vax2 = VLSEG_FLOAT(&a_ptr[j], vl); + vxx2 = VLSSEG_FLOAT(&x[ix], stride_x, vl); + + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) vr = VFMACCVV_FLOAT_TU(vr, va0, vx0, vl); @@ -148,9 +164,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, ix += vl * inc_x * 2; } - v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); + v_res = VFREDSUM_FLOAT_TU(v_res, vr, v_z0, vlmax); temp_r = VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUM_FLOAT(vi, v_z0, vlmax); + v_res = VFREDSUM_FLOAT_TU(v_res, vi, v_z0, vlmax); temp_i = VFMVFS_FLOAT_M1(v_res); #if !defined(XCONJ) diff --git a/kernel/riscv64/zhemm_ltcopy_rvv_v1.c b/kernel/riscv64/zhemm_ltcopy_rvv_v1.c index 79b20a646..97013895a 100644 --- a/kernel/riscv64/zhemm_ltcopy_rvv_v1.c +++ b/kernel/riscv64/zhemm_ltcopy_rvv_v1.c @@ -31,12 +31,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e32m2() #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 #define VLSEV_FLOAT __riscv_vlse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define INT_V_T vint32m2_t #define VID_V_INT __riscv_vid_v_i32m2 #define VADD_VX_INT __riscv_vadd_vx_i32m2 @@ -51,12 +54,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VSETVL(n) __riscv_vsetvl_e64m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m2() #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 #define VLSEV_FLOAT __riscv_vlse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define INT_V_T vint64m2_t #define VID_V_INT __riscv_vid_v_i64m2 #define VADD_VX_INT __riscv_vadd_vx_i64m2 @@ -81,6 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG stride_lda = sizeof(FLOAT) * lda * 2; FLOAT_V_T vb0, vb1, vb2, va10, va11, va20, va21, vzero; + FLOAT_VX2_T va1x2, va2x2, vbx2; VBOOL_T vbool_gt0, vbool_lt0, vbool_eq0; INT_V_T vindex_max, vindex; @@ -96,8 +103,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao2 = a + posY * 2 + posX * lda * 2; for (i = m; i > 0; i--, offset--) { - VLSSEG2_FLOAT(&va20, &va21, ao2, stride_lda, vl); - VLSEG2_FLOAT(&va10, &va11, ao1, vl); + va2x2 = VLSSEG2_FLOAT(ao2, stride_lda, vl); + va1x2 = VLSEG2_FLOAT(ao1, vl); + + va20 = VGET_VX2(va2x2, 0); + va21 = VGET_VX2(va2x2, 1); + va10 = VGET_VX2(va1x2, 0); + va11 = VGET_VX2(va1x2, 1); vindex = VADD_VX_INT(vindex_max, offset, vl); vbool_gt0 = VMSGT_VX_INT(vindex, 0, vl); @@ -111,7 +123,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON vb1 = VMERGE_VVM_FLOAT(vb1, vb2, vbool_lt0, vl); vb1 = VMERGE_VVM_FLOAT(vb1, vzero, vbool_eq0, vl); - VSSEG2_FLOAT(b, vb0, vb1, vl); + + vbx2 = VSET_VX2(vbx2, 0, vb0); + vbx2 = VSET_VX2(vbx2, 1, vb1); + VSSEG2_FLOAT(b, vbx2, vl); b += vl * 2; ao1 += lda * 2; diff --git a/kernel/riscv64/zhemm_utcopy_rvv_v1.c b/kernel/riscv64/zhemm_utcopy_rvv_v1.c index a86815275..59029e9e5 100644 --- a/kernel/riscv64/zhemm_utcopy_rvv_v1.c +++ b/kernel/riscv64/zhemm_utcopy_rvv_v1.c @@ -31,12 +31,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e32m2() #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 #define VLSEV_FLOAT __riscv_vlse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define INT_V_T vint32m2_t #define VID_V_INT __riscv_vid_v_i32m2 #define VADD_VX_INT __riscv_vadd_vx_i32m2 @@ -51,12 +54,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VSETVL(n) __riscv_vsetvl_e64m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m2() #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 #define VLSEV_FLOAT __riscv_vlse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define INT_V_T vint64m2_t #define VID_V_INT __riscv_vid_v_i64m2 #define VADD_VX_INT __riscv_vadd_vx_i64m2 @@ -79,6 +85,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG stride_lda = sizeof(FLOAT) * lda * 2; FLOAT_V_T vb0, vb1, vb2, va10, va11, va20, va21, vzero; + FLOAT_VX2_T va1x2, va2x2, vbx2; VBOOL_T vbool_gt0, vbool_eq0; INT_V_T vindex_max, vindex; @@ -94,8 +101,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao2 = a + posX * 2 + posY * lda * 2; for (i = m; i > 0; i--, offset--) { - VLSSEG2_FLOAT(&va10, &va11, ao1, stride_lda, vl); - VLSEG2_FLOAT(&va20, &va21, ao2, vl); + va1x2 = VLSSEG2_FLOAT(ao1, stride_lda, vl); + va2x2 = VLSEG2_FLOAT(ao2, vl); + + va20 = VGET_VX2(va2x2, 0); + va21 = VGET_VX2(va2x2, 1); + va10 = VGET_VX2(va1x2, 0); + va11 = VGET_VX2(va1x2, 1); vindex = VADD_VX_INT(vindex_max, offset, vl); vbool_gt0 = VMSGT_VX_INT(vindex, 0, vl); @@ -108,7 +120,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON vb1 = VMERGE_VVM_FLOAT(vb1, vb2, vbool_gt0, vl); vb1 = VMERGE_VVM_FLOAT(vb1, vzero, vbool_eq0, vl); - VSSEG2_FLOAT(b, vb0, vb1, vl); + + vbx2 = VSET_VX2(vbx2, 0, vb0); + vbx2 = VSET_VX2(vbx2, 1, vb1); + VSSEG2_FLOAT(b, vbx2, vl); b += vl * 2; ao1 += 2; diff --git a/kernel/riscv64/znrm2_rvv.c b/kernel/riscv64/znrm2_rvv.c index d2b27aa8d..32f67758a 100644 --- a/kernel/riscv64/znrm2_rvv.c +++ b/kernel/riscv64/znrm2_rvv.c @@ -28,95 +28,248 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e32m4() -#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m4_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFREDMAXVS_FLOAT_TU __riscv_vfredmax_vs_f32m4_f32m1_tu -#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 -#define VFABSV_FLOAT __riscv_vfabs_v_f32m4 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define MASK_T vbool8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1_tu +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VMFIRSTM __riscv_vfirst_m_b8 +#define VFREDMAXVS_FLOAT_TU __riscv_vfredmax_vs_f32m4_f32m1_tu +#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32 +#define VMFGTVF_FLOAT __riscv_vmfgt_vf_f32m4_b8 +#define VFDIVVF_FLOAT __riscv_vfdiv_vf_f32m4 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m4 #else -#define VSETVL(n) __riscv_vsetvl_e64m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e64m4() -#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m4_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFREDMAXVS_FLOAT_TU __riscv_vfredmax_vs_f64m4_f64m1_tu -#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 -#define VFABSV_FLOAT __riscv_vfabs_v_f64m4 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define MASK_T vbool16_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1_tu +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VMFIRSTM __riscv_vfirst_m_b16 +#define VFREDMAXVS_FLOAT_TU __riscv_vfredmax_vs_f64m4_f64m1_tu +#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64 +#define VMFGTVF_FLOAT __riscv_vmfgt_vf_f64m4_b16 +#define VFDIVVF_FLOAT __riscv_vfdiv_vf_f64m4 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m4 #endif -// TODO: Should single precision use the widening MAC, or perhaps all should be double? 
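Context for the rewritten body that follows: the earlier version accumulated the raw sum of squares and the running maximum with two reductions and divided only once at the end, which can overflow or underflow for extreme inputs. The replacement keeps a running (scale, ssq) pair and rescales ssq by (scale/new_scale)^2 whenever a larger magnitude is found — the classic scaled sum-of-squares update used by the reference BLAS nrm2. A scalar model of that update rule is sketched below; the function name and signature are illustrative only, and the vector code applies the same rule chunk-wise, using the vfredmax reduction to find the new scale and vfdiv to rescale each chunk.

#include <math.h>

/* Scalar model of the rescaled sum-of-squares update (illustrative only). */
static double znrm2_scalar_model(long n, const double *x)
{
    double scale = 0.0, ssq = 0.0;              /* running max magnitude, scaled sum */
    for (long i = 0; i < 2 * n; i++) {          /* 2*n reals for n complex entries */
        double a = fabs(x[i]);
        if (a == 0.0) continue;
        if (a > scale) {                        /* new maximum: rescale the old sum */
            ssq = 1.0 + ssq * (scale / a) * (scale / a);
            scale = a;
        } else {
            ssq += (a / scale) * (a / scale);
        }
    }
    return scale * sqrt(ssq);                   /* equals sqrt(sum |x_i|^2), overflow-safe */
}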
- FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i=0, j=0; - if ( n <= 0 ) return(0.0); + if (n <= 0 || inc_x <= 0) return(0.0); + + FLOAT_V_T vr, v0, v_zero; + unsigned int gvl = 0; + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); - FLOAT_V_T vr, v0, v1; - FLOAT_V_T_M1 v_max, v_res; FLOAT scale = 0.0, ssq = 0.0; - - size_t vlmax = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, vlmax); - v_max = VFMVVF_FLOAT_M1(0, vlmax); - - vr = VFMVVF_FLOAT(0, vlmax); - + MASK_T mask; + BLASLONG index = 0; if (inc_x == 1) { - - for (size_t vl; n > 0; n -= vl, x += vl*2) { - vl = VSETVL(n); - - VLSEG_FLOAT(&v0, &v1, x, vl); - v0 = VFABSV_FLOAT(v0, vl); - v1 = VFABSV_FLOAT(v1, vl); - - v_max = VFREDMAXVS_FLOAT_TU(v_max, v0, v_max, vl); - vr = VFMACCVV_FLOAT_TU(vr, v0, v0, vl); - - v_max = VFREDMAXVS_FLOAT_TU(v_max, v1, v_max, vl); - vr = VFMACCVV_FLOAT_TU(vr, v1, v1, vl); + BLASLONG n2 = n * 2; + gvl = VSETVL(n2); + vr = VFMVVF_FLOAT(0, gvl); + v_zero = VFMVVF_FLOAT(0, gvl); + for (i=0,j=0; i 0; n -= vl, x += vl*inc_x*2) { - vl = VSETVL(n); - - VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl); - v0 = VFABSV_FLOAT(v0, vl); - v1 = VFABSV_FLOAT(v1, vl); - - v_max = VFREDMAXVS_FLOAT_TU(v_max, v0, v_max, vl); - vr = VFMACCVV_FLOAT_TU(vr, v0, v0, vl); - - v_max = VFREDMAXVS_FLOAT_TU(v_max, v1, v_max, vl); - vr = VFMACCVV_FLOAT_TU(vr, v1, v1, vl); + v0 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); + //fabs(vector) + v0 = VFABSV_FLOAT(v0, gvl); + //if scale change + mask = VMFGTVF_FLOAT(v0, scale, gvl); + index = VMFIRSTM(mask, gvl); + if (index == -1) { // no elements greater than scale + if(scale != 0.0) { + v0 = VFDIVVF_FLOAT(v0, scale, gvl); + vr = VFMACCVV_FLOAT_TU(vr, v0, v0, gvl); + } + } else { // found greater element + //ssq in vector vr: vr[0] + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + //total ssq before current vector + ssq += VFMVFS_FLOAT(v_res); + //find max + v_res = VFREDMAXVS_FLOAT_TU(v_res, v0, v_z0, gvl); + //update ssq before max_index + ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res)); + //update scale + scale = VFMVFS_FLOAT(v_res); + //ssq in vector vr + v0 = VFDIVVF_FLOAT(v0, scale, gvl); + vr = VFMACCVV_FLOAT_TU(v_zero, v0, v0, gvl); + } + j += gvl; + idx += inc_v; } + //ssq in vector vr: vr[0] + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + //total ssq now + ssq += VFMVFS_FLOAT(v_res); + //tail + if (j < n) { + gvl = VSETVL(n-j); + v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl); + //fabs(vector) + v0 = VFABSV_FLOAT(v0, gvl); + //if scale change + mask = VMFGTVF_FLOAT(v0, scale, gvl); + index = VMFIRSTM(mask, gvl); + if(index == -1) { // no elements greater than scale + if(scale != 0.0) { + v0 = VFDIVVF_FLOAT(v0, scale, gvl); + vr = VFMACCVV_FLOAT_TU(v_zero, v0, v0, gvl); + } + } else { // found greater element + //find max + v_res = VFREDMAXVS_FLOAT_TU(v_res, v0, v_z0, gvl); + //update ssq before max_index + ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res)); + //update scale + scale = VFMVFS_FLOAT(v_res); + v0 = VFDIVVF_FLOAT(v0, scale, gvl); + vr = VFMACCVV_FLOAT_TU(v_zero, v0, v0, gvl); + } + + v0 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); + //fabs(vector) + v0 = VFABSV_FLOAT(v0, gvl); + //if scale change + mask = VMFGTVF_FLOAT(v0, scale, gvl); + index = VMFIRSTM(mask, gvl); + if (index == -1) {//no elements greater than scale + if(scale != 0.0) { + v0 = VFDIVVF_FLOAT(v0, scale, gvl); + vr = VFMACCVV_FLOAT_TU(vr, v0, v0, gvl); + } + } else { // found greater element + //ssq in vector 
vr: vr[0] + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + //total ssq before current vector + ssq += VFMVFS_FLOAT(v_res); + //find max + v_res = VFREDMAXVS_FLOAT_TU(v_res, v0, v_z0, gvl); + //update ssq before max_index + ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res)); + //update scale + scale = VFMVFS_FLOAT(v_res); + v0 = VFDIVVF_FLOAT(v0, scale, gvl); + vr = VFMACCVV_FLOAT_TU(v_zero, v0, v0, gvl); + } + //ssq in vector vr: vr[0] + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + //total ssq now + ssq += VFMVFS_FLOAT(v_res); + } } - - v_res = VFREDSUM_FLOAT(vr, v_res, vlmax); - - ssq = VFMVFS_FLOAT_M1(v_res); - scale = VFMVFS_FLOAT_M1(v_max); - ssq = ssq / (scale*scale); - - return(scale * sqrt(ssq)); + return(scale * sqrt(ssq)); } diff --git a/kernel/riscv64/zrot_rvv.c b/kernel/riscv64/zrot_rvv.c index ee81bfe91..1d5390684 100644 --- a/kernel/riscv64/zrot_rvv.c +++ b/kernel/riscv64/zrot_rvv.c @@ -30,28 +30,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VSET_VX2 __riscv_vset_v_f32m4_f32m4x2 #define VLEV_FLOAT __riscv_vle32_v_f32m4 #define VLSEV_FLOAT __riscv_vlse32_v_f32m4 #define VSEV_FLOAT __riscv_vse32_v_f32m4 #define VSSEV_FLOAT __riscv_vsse32_v_f32m4 -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 -#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 #else #define VSETVL(n) __riscv_vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VSET_VX2 __riscv_vset_v_f64m4_f64m4x2 #define VLEV_FLOAT __riscv_vle64_v_f64m4 #define VLSEV_FLOAT __riscv_vlse64_v_f64m4 #define VSEV_FLOAT __riscv_vse64_v_f64m4 #define VSSEV_FLOAT __riscv_vsse64_v_f64m4 -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 -#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 #define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 @@ -63,6 +69,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT if (n <= 0) return(0); FLOAT_V_T vt0, vt1, vx0, vx1, vy0, vy1; + FLOAT_VX2_T vxx2, vyx2, vtx2; if (inc_x == 0 && inc_y == 0) { BLASLONG i=0; @@ -93,8 +100,13 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) { vl = VSETVL(n); - VLSEG_FLOAT(&vx0, &vx1, x, vl); - VLSEG_FLOAT(&vy0, &vy1, y, vl); + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); vt0 = VFMULVF_FLOAT(vx0, c, vl); vt0 = 
VFMACCVF_FLOAT(vt0, s, vy0, vl); @@ -105,8 +117,13 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT vy1 = VFMULVF_FLOAT(vy1, c, vl); vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl); - VSSEG_FLOAT(x, vt0, vt1, vl); - VSSEG_FLOAT(y, vy0, vy1, vl); + vtx2 = VSET_VX2(vtx2, 0, vt0); + vtx2 = VSET_VX2(vtx2, 1, vt1); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + + VSSEG_FLOAT(x, vtx2, vl); + VSSEG_FLOAT(y, vyx2, vl); } } else if (inc_x == 1){ @@ -115,8 +132,13 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { vl = VSETVL(n); - VLSEG_FLOAT(&vx0, &vx1, x, vl); - VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); vt0 = VFMULVF_FLOAT(vx0, c, vl); vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl); @@ -127,8 +149,13 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT vy1 = VFMULVF_FLOAT(vy1, c, vl); vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl); - VSSEG_FLOAT(x, vt0, vt1, vl); - VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl); + vtx2 = VSET_VX2(vtx2, 0, vt0); + vtx2 = VSET_VX2(vtx2, 1, vt1); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + + VSSEG_FLOAT(x, vtx2, vl); + VSSSEG_FLOAT(y, stride_y, vyx2, vl); } } else if (inc_y == 1){ @@ -137,8 +164,13 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); - VLSEG_FLOAT(&vy0, &vy1, y, vl); + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); vt0 = VFMULVF_FLOAT(vx0, c, vl); vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl); @@ -149,8 +181,13 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT vy1 = VFMULVF_FLOAT(vy1, c, vl); vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl); - VSSSEG_FLOAT(x, stride_x, vt0, vt1, vl); - VSSEG_FLOAT(y, vy0, vy1, vl); + vtx2 = VSET_VX2(vtx2, 0, vt0); + vtx2 = VSET_VX2(vtx2, 1, vt1); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + + VSSSEG_FLOAT(x, stride_x, vtx2, vl); + VSSEG_FLOAT(y, vyx2, vl); } } else { @@ -160,8 +197,13 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); - VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); vt0 = VFMULVF_FLOAT(vx0, c, vl); vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl); @@ -172,8 +214,13 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT vy1 = VFMULVF_FLOAT(vy1, c, vl); vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl); - VSSSEG_FLOAT(x, stride_x, vt0, vt1, vl); - VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl); + vtx2 = VSET_VX2(vtx2, 0, vt0); + vtx2 = VSET_VX2(vtx2, 1, vt1); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + + VSSSEG_FLOAT(x, stride_x, vtx2, vl); + VSSSEG_FLOAT(y, stride_y, vyx2, vl); } } diff --git a/kernel/riscv64/zscal_rvv.c b/kernel/riscv64/zscal_rvv.c index 779fab68c..2586c6036 
100644 --- a/kernel/riscv64/zscal_rvv.c +++ b/kernel/riscv64/zscal_rvv.c @@ -31,10 +31,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e32m4(n) #define VSETVL_MAX __riscv_vsetvlmax_e32m4() #define FLOAT_V_T vfloat32m4_t -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 -#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 -#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4 +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VSET_VX2 __riscv_vset_v_f32m4_f32m4x2 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 @@ -43,10 +46,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e64m4(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m4() #define FLOAT_V_T vfloat64m4_t -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 -#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 -#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4 +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VSET_VX2 __riscv_vset_v_f64m4_f64m4x2 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 #define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 @@ -61,6 +67,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F FLOAT_V_T vt, vr, vi; BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); size_t vlmax = VSETVL_MAX; + FLOAT_VX2_T vx2; if(da_r == 0.0 && da_i == 0.0) { @@ -71,16 +78,18 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F for (size_t vl; n > 0; n -= vl, x += vl*2) { vl = VSETVL(n); - - VSSEG_FLOAT(x, vr, vi, vl); + vx2 = VSET_VX2(vx2, 0, vr); + vx2 = VSET_VX2(vx2, 1, vi); + VSSEG_FLOAT(x, vx2, vl); } } else { for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { vl = VSETVL(n); - - VSSSEG_FLOAT(x, stride_x, vr, vi, vl); + vx2 = VSET_VX2(vx2, 0, vr); + vx2 = VSET_VX2(vx2, 1, vi); + VSSSEG_FLOAT(x, stride_x, vx2, vl); } } @@ -89,12 +98,17 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vr, &vi, x, stride_x, vl); + vx2 = VLSSEG_FLOAT(x, stride_x, vl); + vr = VGET_VX2(vx2, 0); + vi = VGET_VX2(vx2, 1); vt = VFMULVF_FLOAT(vi, -da_i, vl); vi = VFMULVF_FLOAT(vr, da_i, vl); - VSSSEG_FLOAT(x, stride_x, vt, vi, vl); + vx2 = VSET_VX2(vx2, 0, vt); + vx2 = VSET_VX2(vx2, 1, vi); + + VSSSEG_FLOAT(x, stride_x, vx2, vl); } } else if(da_i == 0.0) { @@ -102,12 +116,16 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vr, &vi, x, stride_x, vl); + vx2 = VLSSEG_FLOAT(x, stride_x, vl); + vr = VGET_VX2(vx2, 0); + vi = VGET_VX2(vx2, 1); vr = VFMULVF_FLOAT(vr, da_r, vl); vi = VFMULVF_FLOAT(vi, da_r, vl); - VSSSEG_FLOAT(x, stride_x, vr, vi, vl); + vx2 = VSET_VX2(vx2, 0, vr); + 
vx2 = VSET_VX2(vx2, 1, vi); + VSSSEG_FLOAT(x, stride_x, vx2, vl); } } else { @@ -117,14 +135,18 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F for (size_t vl; n > 0; n -= vl, x += vl*2) { vl = VSETVL(n); - VLSEG_FLOAT(&vr, &vi, x, vl); + vx2 = VLSEG_FLOAT(x, vl); + vr = VGET_VX2(vx2, 0); + vi = VGET_VX2(vx2, 1); vt = VFMULVF_FLOAT(vr, da_r, vl); vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl); vi = VFMULVF_FLOAT(vi, da_r, vl); vi = VFMACCVF_FLOAT(vi, da_i, vr, vl); - VSSEG_FLOAT(x, vt, vi, vl); + vx2 = VSET_VX2(vx2, 0, vt); + vx2 = VSET_VX2(vx2, 1, vi); + VSSEG_FLOAT(x, vx2, vl); } } else { @@ -132,14 +154,18 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vr, &vi, x, stride_x, vl); + vx2 = VLSSEG_FLOAT(x, stride_x, vl); + vr = VGET_VX2(vx2, 0); + vi = VGET_VX2(vx2, 1); vt = VFMULVF_FLOAT(vr, da_r, vl); vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl); vi = VFMULVF_FLOAT(vi, da_r, vl); vi = VFMACCVF_FLOAT(vi, da_i, vr, vl); - VSSSEG_FLOAT(x, stride_x, vt, vi, vl); + vx2 = VSET_VX2(vx2, 0, vt); + vx2 = VSET_VX2(vx2, 1, vi); + VSSSEG_FLOAT(x, stride_x, vx2, vl); } } } diff --git a/kernel/riscv64/zsum_rvv.c b/kernel/riscv64/zsum_rvv.c index b41f70eb5..489188bd5 100644 --- a/kernel/riscv64/zsum_rvv.c +++ b/kernel/riscv64/zsum_rvv.c @@ -32,8 +32,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX __riscv_vsetvlmax_e32m4() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 #define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 @@ -44,8 +46,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VSETVL_MAX __riscv_vsetvlmax_e64m4() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 #define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 @@ -59,6 +63,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return(sumf); FLOAT_V_T v0, v1; + FLOAT_VX2_T vx2; size_t vlmax = VSETVL_MAX; FLOAT_V_T v_sum = VFMVVF_FLOAT(0, vlmax); @@ -67,7 +72,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for (size_t vl; n > 0; n -= vl, x += vl*2) { vl = VSETVL(n); - VLSEG_FLOAT(&v0, &v1, x, vl); + vx2 = VLSEG_FLOAT(x, vl); + + v0 = VGET_VX2(vx2, 0); + v1 = VGET_VX2(vx2, 1); v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v0, vl); v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v1, vl); @@ -80,7 +88,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl); + vx2 = VLSSEG_FLOAT(x, stride_x, vl); + + v0 = VGET_VX2(vx2, 0); + v1 = VGET_VX2(vx2, 1); v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v0, vl); v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v1, vl); diff --git a/kernel/riscv64/zswap_rvv.c b/kernel/riscv64/zswap_rvv.c index 17b7b9f43..c2adf5e05 100644 --- a/kernel/riscv64/zswap_rvv.c +++ b/kernel/riscv64/zswap_rvv.c @@ -29,18 +29,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m4(n) -#define FLOAT_V_T vfloat32m4_t -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 -#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 -#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4 +#define FLOAT_VX2_T vfloat32m4x2_t +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4x2 #else #define VSETVL(n) __riscv_vsetvl_e64m4(n) -#define FLOAT_V_T vfloat64m4_t -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 -#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 -#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4 +#define FLOAT_VX2_T vfloat64m4x2_t +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4x2 #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) @@ -48,7 +48,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm if (n <= 0) return(0); - FLOAT_V_T vx0, vx1, vy0, vy1; + FLOAT_VX2_T vxx2, vyx2; if (inc_x == 0 && inc_y == 0) { if (n & 1) { @@ -75,8 +75,8 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm BLASLONG m = n - 1; for (size_t vl; m > 0; m -= vl * 2, ptr -= vl*inc_y * 2) { vl = VSETVL(m); - VLSSEG_FLOAT(&vy0, &vy1, ptr - 2, stride_y, vl); - VSSSEG_FLOAT(ptr, stride_y, vy0, vy1, vl); + vyx2 = VLSSEG_FLOAT(ptr - 2, stride_y, vl); + VSSSEG_FLOAT(ptr, stride_y, vyx2, vl); } y[0] = temp[0]; y[1] = 
temp[1]; @@ -92,8 +92,8 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm BLASLONG m = n - 1; for (size_t vl; m > 0; m -= vl * 2, ptr -= vl*inc_x * 2) { vl = VSETVL(m); - VLSSEG_FLOAT(&vx0, &vx1, ptr - 2, stride_x, vl); - VSSSEG_FLOAT(ptr, stride_x, vx0, vx1, vl); + vxx2 = VLSSEG_FLOAT(ptr - 2, stride_x, vl); + VSSSEG_FLOAT(ptr, stride_x, vxx2, vl); } x[0] = temp[0]; x[1] = temp[1]; @@ -103,11 +103,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) { vl = VSETVL(n); - VLSEG_FLOAT(&vx0, &vx1, x, vl); - VLSEG_FLOAT(&vy0, &vy1, y, vl); + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSEG_FLOAT(y, vl); - VSSEG_FLOAT(y, vx0, vx1, vl); - VSSEG_FLOAT(x, vy0, vy1, vl); + VSSEG_FLOAT(y, vxx2, vl); + VSSEG_FLOAT(x, vyx2, vl); } } else if (inc_x == 1){ @@ -116,11 +116,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { vl = VSETVL(n); - VLSEG_FLOAT(&vx0, &vx1, x, vl); - VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); - VSSSEG_FLOAT(y, stride_y, vx0, vx1, vl); - VSSEG_FLOAT(x, vy0, vy1, vl); + VSSSEG_FLOAT(y, stride_y, vxx2, vl); + VSSEG_FLOAT(x, vyx2, vl); } } else if (inc_y == 1){ @@ -129,11 +129,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); - VLSEG_FLOAT(&vy0, &vy1, y, vl); + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSEG_FLOAT(y, vl); - VSSEG_FLOAT(y, vx0, vx1, vl); - VSSSEG_FLOAT(x, stride_x, vy0, vy1, vl); + VSSEG_FLOAT(y, vxx2, vl); + VSSSEG_FLOAT(x, stride_x, vyx2, vl); } } else { @@ -143,11 +143,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); - VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); - VSSSEG_FLOAT(y, stride_y, vx0, vx1, vl); - VSSSEG_FLOAT(x, stride_x, vy0, vy1, vl); + VSSSEG_FLOAT(y, stride_y, vxx2, vl); + VSSSEG_FLOAT(x, stride_x, vyx2, vl); } } diff --git a/kernel/riscv64/zsymm_lcopy_rvv_v1.c b/kernel/riscv64/zsymm_lcopy_rvv_v1.c index 0f9e04869..f4d806190 100644 --- a/kernel/riscv64/zsymm_lcopy_rvv_v1.c +++ b/kernel/riscv64/zsymm_lcopy_rvv_v1.c @@ -31,12 +31,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e32m2() #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 #define VLSEV_FLOAT __riscv_vlse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define INT_V_T vint32m2_t #define VID_V_INT __riscv_vid_v_i32m2 #define VADD_VX_INT __riscv_vadd_vx_i32m2 @@ -47,12 +50,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VSETVL(n) __riscv_vsetvl_e64m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m2() #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 #define VLSEV_FLOAT __riscv_vlse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define INT_V_T vint64m2_t #define VID_V_INT __riscv_vid_v_i64m2 #define VADD_VX_INT __riscv_vadd_vx_i64m2 @@ -70,6 +76,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG stride_lda = sizeof(FLOAT)*lda*2; FLOAT_V_T vb0, vb1, va10, va11, va20, va21; + FLOAT_VX2_T va1x2, va2x2, vbx2; VBOOL_T vbool; INT_V_T vindex_max, vindex; @@ -85,15 +92,23 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON for (i = m; i > 0; i--, offset--) { - VLSSEG2_FLOAT(&va20, &va21, ao2, stride_lda, vl); - VLSEG2_FLOAT(&va10, &va11, ao1, vl); + va2x2 = VLSSEG2_FLOAT(ao2, stride_lda, vl); + va1x2 = VLSEG2_FLOAT(ao1, vl); + + va20 = VGET_VX2(va2x2, 0); + va21 = VGET_VX2(va2x2, 1); + va10 = VGET_VX2(va1x2, 0); + va11 = VGET_VX2(va1x2, 1); vindex = VADD_VX_INT(vindex_max, offset, vl); vbool = VMSGT_VX_INT(vindex, 0, vl); vb0 = VMERGE_VVM_FLOAT(va20, va10, vbool, vl); vb1 = VMERGE_VVM_FLOAT(va21, va11, vbool, vl); - VSSEG2_FLOAT(b, vb0, vb1, vl); + + vbx2 = VSET_VX2(vbx2, 0, vb0); + vbx2 = VSET_VX2(vbx2, 1, vb1); + VSSEG2_FLOAT(b, vbx2, vl); b += vl * 2; ao1 += lda * 2; diff --git a/kernel/riscv64/zsymm_ucopy_rvv_v1.c b/kernel/riscv64/zsymm_ucopy_rvv_v1.c index fdc693700..069551bb0 100644 --- a/kernel/riscv64/zsymm_ucopy_rvv_v1.c +++ b/kernel/riscv64/zsymm_ucopy_rvv_v1.c @@ -31,12 +31,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e32m2() #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 #define VLSEV_FLOAT __riscv_vlse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define INT_V_T vint32m2_t #define VID_V_INT __riscv_vid_v_i32m2 #define VADD_VX_INT __riscv_vadd_vx_i32m2 @@ -47,12 +50,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VSETVL(n) __riscv_vsetvl_e64m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m2() #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 #define VLSEV_FLOAT __riscv_vlse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define INT_V_T vint64m2_t #define VID_V_INT __riscv_vid_v_i64m2 #define VADD_VX_INT __riscv_vadd_vx_i64m2 @@ -71,6 +77,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG stride_lda = sizeof(FLOAT)*lda * 2; FLOAT_V_T vb0, vb1, va10, va11, va20, va21; + FLOAT_VX2_T va1x2, va2x2, vbx2; VBOOL_T vbool; INT_V_T vindex_max, vindex; @@ -86,15 +93,23 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao2 = a + posX * 2 + 0 + posY * lda * 2; for (i = m; i > 0; i--, offset--) { - VLSSEG2_FLOAT(&va10, &va11, ao1, stride_lda, vl); - VLSEG2_FLOAT(&va20, &va21, ao2, vl); + va1x2 = VLSSEG2_FLOAT(ao1, stride_lda, vl); + va2x2 = VLSEG2_FLOAT(ao2, vl); + + va20 = VGET_VX2(va2x2, 0); + va21 = VGET_VX2(va2x2, 1); + va10 = VGET_VX2(va1x2, 0); + va11 = VGET_VX2(va1x2, 1); vindex = VADD_VX_INT(vindex_max, offset, vl); vbool = VMSGT_VX_INT(vindex, 0, vl); vb0 = VMERGE_VVM_FLOAT(va20, va10, vbool, vl); vb1 = VMERGE_VVM_FLOAT(va21, va11, vbool, vl); - VSSEG2_FLOAT(b, vb0, vb1, vl); + + vbx2 = VSET_VX2(vbx2, 0, vb0); + vbx2 = VSET_VX2(vbx2, 1, vb1); + VSSEG2_FLOAT(b, vbx2, vl); b += vl * 2; ao1 += 2; diff --git a/kernel/riscv64/ztrmm_lncopy_rvv_v1.c b/kernel/riscv64/ztrmm_lncopy_rvv_v1.c index 7276618c5..ae664561b 100644 --- a/kernel/riscv64/ztrmm_lncopy_rvv_v1.c +++ b/kernel/riscv64/ztrmm_lncopy_rvv_v1.c @@ -32,12 +32,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 #define VLSEV_FLOAT __riscv_vlse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define VBOOL_T vbool16_t #define UINT_V_T vint32m2_t #define VID_V_UINT __riscv_vid_v_i32m2 @@ -47,12 +49,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 #define VLSEV_FLOAT __riscv_vlse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define VBOOL_T vbool32_t #define UINT_V_T vuint64m2_t #define VID_V_UINT __riscv_vid_v_u64m2 @@ -69,6 +73,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG stride_lda = sizeof(FLOAT)*lda*2; + FLOAT_VX2_T vax2; FLOAT_V_T va0, va1; size_t vl; @@ -98,8 +103,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON { if (X > posY) { - VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + VSSEG2_FLOAT(b, vax2, vl); ao += 2; b += vl * 2; @@ -119,7 +124,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON vindex = VID_V_UINT(vl); for (unsigned int j = 0; j < vl; j++) { - VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl); va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); @@ -128,7 +136,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON va0 = VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl); va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl); #endif - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(b, vax2, vl); ao += 2; b += vl * 2; } diff --git a/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c b/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c index 72e8f2ce2..ab8d34337 100644 --- a/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c +++ b/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c @@ -32,11 +32,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define VBOOL_T vbool16_t #define UINT_V_T vuint32m2_t #define VID_V_UINT __riscv_vid_v_u32m2 @@ -46,11 +48,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define VBOOL_T vbool32_t #define UINT_V_T vuint64m2_t #define VID_V_UINT __riscv_vid_v_u64m2 @@ -65,6 +69,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON FLOAT *ao; + FLOAT_VX2_T vax2; FLOAT_V_T va0, va1; size_t vl; #ifdef UNIT @@ -101,8 +106,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON else if (X < posY) { //va1 = VLEV_FLOAT(ao, vl); - VLSEG2_FLOAT(&va0, &va1, ao, vl); - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VLSEG2_FLOAT(ao, vl); + VSSEG2_FLOAT(b, vax2, vl); ao += lda * 2; b += vl * 2; @@ -115,7 +120,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON for (unsigned int j = 0; j < vl; j++) { //va1 = VLEV_FLOAT(ao, vl); - VLSEG2_FLOAT(&va0, &va1, ao, vl); + vax2 = VLSEG2_FLOAT(ao, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl); va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); @@ -124,7 +132,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON va0 = VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl); va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl); #endif - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(b, vax2, vl); ao += lda * 2; b += vl * 2; } diff --git a/kernel/riscv64/ztrmm_uncopy_rvv_v1.c b/kernel/riscv64/ztrmm_uncopy_rvv_v1.c index e6d36c86d..ba6e63b96 100644 --- a/kernel/riscv64/ztrmm_uncopy_rvv_v1.c +++ b/kernel/riscv64/ztrmm_uncopy_rvv_v1.c @@ -32,12 +32,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VLSEV_FLOAT __riscv_vlse32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define VBOOL_T vbool16_t #define UINT_V_T vuint32m2_t #define VID_V_UINT __riscv_vid_v_u32m2 @@ -47,12 +49,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VLSEV_FLOAT __riscv_vlse64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define VBOOL_T vbool32_t #define UINT_V_T vuint64m2_t #define VID_V_UINT __riscv_vid_v_u64m2 @@ -67,6 +71,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG stride_lda = sizeof(FLOAT) * lda * 2; FLOAT *ao; + FLOAT_VX2_T vax2; FLOAT_V_T va0, va1; size_t vl; @@ -96,8 +101,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON { if (X < posY) { - VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + VSSEG2_FLOAT(b, vax2, vl); ao += 2; b += vl * 2; @@ -118,7 +123,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON vindex = VID_V_UINT(vl); for (unsigned int j = 0; j < vl; j++) { - VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl); va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); @@ -127,7 +135,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON va0 = VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl); va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl); #endif - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(b, vax2, vl); ao += 2; b += vl * 2; } diff --git a/kernel/riscv64/ztrmm_utcopy_rvv_v1.c b/kernel/riscv64/ztrmm_utcopy_rvv_v1.c index 7085cfc37..a624fff54 100644 --- a/kernel/riscv64/ztrmm_utcopy_rvv_v1.c +++ b/kernel/riscv64/ztrmm_utcopy_rvv_v1.c @@ -34,11 +34,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define VBOOL_T vbool16_t #define UINT_V_T vuint32m2_t #define VID_V_UINT __riscv_vid_v_u32m2 @@ -48,11 +50,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define VBOOL_T vbool32_t #define UINT_V_T vuint64m2_t #define VID_V_UINT __riscv_vid_v_u64m2 @@ -66,6 +70,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG i, j, js, X; FLOAT *ao; + + FLOAT_VX2_T vax2; FLOAT_V_T va0, va1; #ifdef UNIT VBOOL_T vbool_eq; @@ -103,8 +109,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } else if (X > posY) { - VLSEG2_FLOAT(&va0, &va1, ao, vl); - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VLSEG2_FLOAT(ao, vl); + VSSEG2_FLOAT(b, vax2, vl); ao += lda * 2; b += vl * 2; X++; @@ -115,7 +121,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON vindex = VID_V_UINT(vl); for (j = 0; j < vl; j++) { - VLSEG2_FLOAT(&va0, &va1, ao, vl); + vax2 = VLSEG2_FLOAT(ao, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl); va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); @@ -124,7 +133,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON va0 = VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl); va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl); #endif - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(b, vax2, vl); ao += lda * 2; b += vl * 2; } diff --git a/kernel/riscv64/ztrmmkernel_rvv_v1x4.c b/kernel/riscv64/ztrmmkernel_rvv_v1x4.c index 92b4b855b..db5f06af8 100644 --- a/kernel/riscv64/ztrmmkernel_rvv_v1x4.c +++ b/kernel/riscv64/ztrmmkernel_rvv_v1x4.c @@ -30,10 +30,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 @@ -41,10 +44,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 @@ -85,6 +91,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b off = 0; #endif + FLOAT_VX2_T vax2; FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7; FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; @@ -130,10 +137,14 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b for (k = temp/4; k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; - VLSEG2_FLOAT(&va2, &va3, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -158,7 +169,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 8; - VLSEG2_FLOAT(&va4, &va5, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va4 = VGET_VX2(vax2, 0); + va5 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); @@ -183,7 +196,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 8; - VLSEG2_FLOAT(&va6, &va7, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va6 = VGET_VX2(vax2, 0); + va7 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); @@ -233,7 +248,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b for (k = temp & 3; k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -262,25 +279,37 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b va1 = VFMULVF_FLOAT(vres1, alphar, vl); va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); - VSSEG2_FLOAT(C0, va0, va1, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C0, vax2, vl); va2 = VFMULVF_FLOAT(vres2, alphar, vl); va3 = VFMULVF_FLOAT(vres3, alphar, vl); va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl); va3 = VFMACCVF_FLOAT(va3, alphai, vres2, vl); - VSSEG2_FLOAT(C1, va2, va3, vl); + + vax2 = VSET_VX2(vax2, 0, va2); + vax2 = VSET_VX2(vax2, 1, va3); + VSSEG2_FLOAT(C1, vax2, vl); va0 = VFMULVF_FLOAT(vres4, alphar, vl); va1 = VFMULVF_FLOAT(vres5, alphar, vl); va0 = VFNMSACVF_FLOAT(va0, alphai, vres5, vl); va1 = VFMACCVF_FLOAT(va1, alphai, vres4, vl); - VSSEG2_FLOAT(C2, va0, va1, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C2, vax2, vl); va2 = VFMULVF_FLOAT(vres6, alphar, vl); va3 = VFMULVF_FLOAT(vres7, alphar, vl); va2 = VFNMSACVF_FLOAT(va2, alphai, vres7, vl); va3 = VFMACCVF_FLOAT(va3, alphai, vres6, vl); - VSSEG2_FLOAT(C3, va2, va3, vl); + + vax2 = VSET_VX2(vax2, 0, va2); + vax2 = VSET_VX2(vax2, 1, va3); + VSSEG2_FLOAT(C3, vax2, vl); #if ( 
defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = bk - off; @@ -342,10 +371,14 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b #endif for (k = temp/4; k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; - VLSEG2_FLOAT(&va2, &va3, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -360,7 +393,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 4; - VLSEG2_FLOAT(&va4, &va5, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va4 = VGET_VX2(vax2, 0); + va5 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); @@ -375,7 +410,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 4; - VLSEG2_FLOAT(&va6, &va7, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va6 = VGET_VX2(vax2, 0); + va7 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); @@ -405,7 +442,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b for (k = temp & 3; k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -425,13 +464,19 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b va1 = VFMULVF_FLOAT(vres1, alphar, vl); va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); - VSSEG2_FLOAT(C0, va0, va1, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C0, vax2, vl); va2 = VFMULVF_FLOAT(vres2, alphar, vl); va3 = VFMULVF_FLOAT(vres3, alphar, vl); va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl); va3 = VFMACCVF_FLOAT(va3, alphai, vres2, vl); - VSSEG2_FLOAT(C1, va2, va3, vl); + + vax2 = VSET_VX2(vax2, 0, va2); + vax2 = VSET_VX2(vax2, 1, va3); + VSSEG2_FLOAT(C1, vax2, vl); #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = bk - off; @@ -487,10 +532,14 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b #endif for (k = temp/4; k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; - VLSEG2_FLOAT(&va2, &va3, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -500,7 +549,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 2; - VLSEG2_FLOAT(&va4, &va5, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va4 = VGET_VX2(vax2, 0); + va5 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); @@ -510,7 +561,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 2; - VLSEG2_FLOAT(&va6, &va7, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va6 = VGET_VX2(vax2, 0); + va7 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); @@ -530,7 +583,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b for (k = temp & 3; k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 
0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -545,7 +600,10 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b va1 = VFMULVF_FLOAT(vres1, alphar, vl); va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); - VSSEG2_FLOAT(C0, va0, va1, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C0, vax2, vl); #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = bk - off; diff --git a/kernel/riscv64/ztrsm_lncopy_rvv_v1.c b/kernel/riscv64/ztrsm_lncopy_rvv_v1.c index 383cb883f..36cec711d 100644 --- a/kernel/riscv64/ztrsm_lncopy_rvv_v1.c +++ b/kernel/riscv64/ztrsm_lncopy_rvv_v1.c @@ -30,20 +30,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 -#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2_m +#define FLOAT_VX2_T vfloat32m2x2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2x2_m #define VBOOL_T vbool16_t #define UINT_V_T vuint32m2_t #define VID_V_UINT __riscv_vid_v_u32m2 #define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 #else #define VSETVL(n) __riscv_vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 -#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2_m +#define FLOAT_VX2_T vfloat64m2x2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2x2_m #define VBOOL_T vbool32_t #define UINT_V_T vuint64m2_t #define VID_V_UINT __riscv_vid_v_u64m2 @@ -64,7 +64,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT BLASLONG stride_lda = sizeof(FLOAT)*lda*2; - FLOAT_V_T va0, va1; + FLOAT_VX2_T vax2; VBOOL_T vbool_cmp; UINT_V_T vindex; size_t vl; @@ -82,9 +82,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT vindex = VID_V_UINT(vl); for (unsigned int j = 0; j < vl; j++) { - VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); - VSSEG2_FLOAT_M(vbool_cmp, b, va0, va1, vl); + VSSEG2_FLOAT_M(vbool_cmp, b, vax2, vl); compinv((b + j * 2), *(ao + j * lda * 2), *(ao + j * lda * 2 + 1)); ao += 2; @@ -97,8 +97,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT { if (ii > jj) { - VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + VSSEG2_FLOAT(b, vax2, vl); } ao += 2; b += vl * 2; diff --git a/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c b/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c index f57e9f1de..3a7bdb522 100644 --- a/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c +++ b/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c @@ -30,20 +30,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 -#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2_m +#define FLOAT_VX2_T vfloat32m2x2_t +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2x2_m #define VBOOL_T vbool16_t #define UINT_V_T vuint32m2_t #define VID_V_UINT __riscv_vid_v_u32m2 #define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 #else #define VSETVL(n) __riscv_vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 -#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2_m +#define FLOAT_VX2_T vfloat64m2x2_t +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2x2_m #define VBOOL_T vbool32_t #define UINT_V_T vuint64m2_t #define VID_V_UINT __riscv_vid_v_u64m2 @@ -60,7 +60,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT jj = offset; - FLOAT_V_T va0, va1; + FLOAT_VX2_T vax2; VBOOL_T vbool_cmp; UINT_V_T vindex; @@ -82,9 +82,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT { compinv((b + j * 2), *(ao + j * 2), *(ao + j * 2 + 1)); - VLSEG2_FLOAT(&va0, &va1, ao, vl); + vax2 = VLSEG2_FLOAT(ao, vl); vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); - VSSEG2_FLOAT_M(vbool_cmp, b, va0, va1, vl); + VSSEG2_FLOAT_M(vbool_cmp, b, vax2, vl); b += vl * 2; ao += lda * 2; @@ -96,8 +96,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT { if (ii < jj) { - VLSEG2_FLOAT(&va0, &va1, ao, vl); - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VLSEG2_FLOAT(ao, vl); + VSSEG2_FLOAT(b, vax2, vl); } ao += lda * 2; b += vl * 2; diff --git a/kernel/riscv64/ztrsm_uncopy_rvv_v1.c b/kernel/riscv64/ztrsm_uncopy_rvv_v1.c index be3613429..2a158d4de 100644 --- a/kernel/riscv64/ztrsm_uncopy_rvv_v1.c +++ b/kernel/riscv64/ztrsm_uncopy_rvv_v1.c @@ -31,20 +31,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 -#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2_m +#define FLOAT_VX2_T vfloat32m2x2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2x2_m #define VBOOL_T vbool16_t #define UINT_V_T vuint32m2_t #define VID_V_UINT __riscv_vid_v_u32m2 #define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 #else #define VSETVL(n) __riscv_vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 -#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2_m +#define FLOAT_VX2_T vfloat64m2x2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2x2_m #define VBOOL_T vbool32_t #define UINT_V_T vuint64m2_t #define VID_V_UINT __riscv_vid_v_u64m2 @@ -62,7 +62,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT FLOAT *ao; jj = offset; - FLOAT_V_T va0, va1; + FLOAT_VX2_T vax2; VBOOL_T vbool_cmp; UINT_V_T vindex; @@ -83,9 +83,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT for (unsigned int j = 0; j < vl; j++) { compinv((b + j * 2), *(ao + j * lda * 2), *(ao + j * lda * 2 + 1)); - VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); - VSSEG2_FLOAT_M(vbool_cmp, b, va0, va1, vl); + VSSEG2_FLOAT_M(vbool_cmp, b, vax2, vl); ao += 2; b += vl * 2; } @@ -96,8 +96,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT { if (ii < jj) { - VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + VSSEG2_FLOAT(b, vax2, vl); } ao += 2; b += vl * 2; diff --git a/kernel/riscv64/ztrsm_utcopy_rvv_v1.c b/kernel/riscv64/ztrsm_utcopy_rvv_v1.c index b1f5ef8f0..4b3319588 100644 --- a/kernel/riscv64/ztrsm_utcopy_rvv_v1.c +++ b/kernel/riscv64/ztrsm_utcopy_rvv_v1.c @@ -30,20 +30,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 -#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2_m +#define FLOAT_VX2_T vfloat32m2x2_t +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2x2_m #define VBOOL_T vbool16_t #define UINT_V_T vuint32m2_t #define VID_V_UINT __riscv_vid_v_u32m2 #define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 #else #define VSETVL(n) __riscv_vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 -#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2_m +#define FLOAT_VX2_T vfloat64m2x2_t +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2x2_m #define VBOOL_T vbool32_t #define UINT_V_T vuint64m2_t #define VID_V_UINT __riscv_vid_v_u64m2 @@ -60,7 +60,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT FLOAT *ao; jj = offset; - FLOAT_V_T va0, va1; + FLOAT_VX2_T vax2; VBOOL_T vbool_cmp; UINT_V_T vindex; @@ -81,9 +81,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT vindex = VID_V_UINT(vl); for (unsigned int j = 0; j < vl; j++) { - VLSEG2_FLOAT(&va0, &va1, ao, vl); + vax2 = VLSEG2_FLOAT(ao, vl); vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); - VSSEG2_FLOAT_M(vbool_cmp, b, va0, va1, vl); + VSSEG2_FLOAT_M(vbool_cmp, b, vax2, vl); compinv((b + j * 2), *(ao + j * 2), *(ao + j * 2 + 1)); @@ -97,8 +97,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT { if (ii > jj) { - VLSEG2_FLOAT(&va0, &va1, ao, vl); - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VLSEG2_FLOAT(ao, vl); + VSSEG2_FLOAT(b, vax2, vl); } ao += lda * 2; b += vl * 2; diff --git a/param.h b/param.h index c5c70b78e..d93221d28 100644 --- a/param.h +++ b/param.h @@ -3057,7 +3057,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_UNROLL_M 8 #define CGEMM_DEFAULT_UNROLL_N 4 -#define CGEMM_DEFAULT_UNROLL_MN 16 +#define CGEMM_DEFAULT_UNROLL_MN 32 #define ZGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_N 4