Merge branch 'develop' of https://github.com/HellerZheng/OpenBLAS_riscv_x280 into HellerZheng-develop

Xianyi Zhang 2022-12-03 12:00:52 +08:00
commit e5313f53d5
71 changed files with 16070 additions and 3439 deletions


@@ -55,6 +55,14 @@ ifeq ($(TARGET), C910V)
TARGET_FLAGS = -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d
endif
ifeq ($(TARGET), x280)
TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d
endif
ifeq ($(TARGET), RISCV64_GENERIC)
TARGET_FLAGS = -march=rv64imafdc -mabi=lp64d
endif
all: getarch_2nd
./getarch_2nd 0 >> $(TARGET_MAKE)
./getarch_2nd 1 >> $(TARGET_CONF)


@@ -2,3 +2,11 @@ ifeq ($(CORE), C910V)
CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920
FCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -static
endif
ifeq ($(CORE), x280)
CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -mllvm --riscv-v-vector-bits-min=512 -ffast-math
FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static
endif
ifeq ($(CORE), RISCV64_GENERIC)
CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d
FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d -static
endif


@@ -186,6 +186,11 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
```
(also known to work on C906)
- **x280**: LLVM auto-vectorization using RISC-V Vector extension 1.0.
```sh
make HOSTCC=gcc TARGET=x280 NUM_THREADS=8 CC=riscv64-unknown-linux-gnu-clang FC=riscv64-unknown-linux-gnu-gfortran
```
### Support for multiple targets in a single library
OpenBLAS can be built for multiple targets with runtime detection of the target CPU by specifying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line, or as `-DDYNAMIC_ARCH=TRUE` in cmake.
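For illustration only, a minimal sketch of both invocations (assuming the usual OpenBLAS build entry points; the set of targets covered by runtime dispatch depends on the platform and OpenBLAS version):
```sh
# One library containing kernels for several targets, selected at runtime
make DYNAMIC_ARCH=1 NUM_THREADS=8

# Equivalent configuration through CMake
cmake -DDYNAMIC_ARCH=TRUE ..
```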


@@ -120,6 +120,7 @@ Z14
10.RISC-V 64:
RISCV64_GENERIC (e.g. PolarFire Soc/SiFive U54)
C910V
x280
11.LOONGARCH64:
LOONGSONGENERIC

File diff suppressed because it is too large.


@@ -95,4 +95,8 @@ static inline int blas_quickdivide(blasint x, blasint y){
#include <riscv_vector.h>
#endif
#if defined(x280)
#include <riscv_vector.h>
#endif
#endif


@@ -72,10 +72,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CPU_GENERIC 0
#define CPU_C910V 1
#define CPU_x280 2
static char *cpuname[] = {
"RISCV64_GENERIC",
"C910V"
"x280"
};
int detect(void){


@@ -1677,6 +1677,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define LIBNAME "c910v"
#define CORENAME "C910V"
#endif
#endif
#ifdef FORCE_x280
#define FORCE
#define ARCHITECTURE "RISCV64"
#define SUBARCHITECTURE "x280"
#define SUBDIRNAME "riscv64"
#define ARCHCONFIG "-Dx280 " \
"-DL1_DATA_SIZE=64536 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 "
#define LIBNAME "x280"
#define CORENAME "x280"
#else
#endif

kernel/riscv64/KERNEL.x280 (new file)

@@ -0,0 +1,235 @@
# **********************************************************************************
# Copyright (c) 2022, The OpenBLAS Project
# All rights reserved.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in
# the documentation and/or other materials provided with the
# distribution.
# 3. Neither the name of the OpenBLAS project nor the names of
# its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# **********************************************************************************
SAMAXKERNEL = amax_rvv.c
DAMAXKERNEL = amax_rvv.c
CAMAXKERNEL = zamax_rvv.c
ZAMAXKERNEL = zamax_rvv.c
SAMINKERNEL = amin_rvv.c
DAMINKERNEL = amin_rvv.c
CAMINKERNEL = zamin_rvv.c
ZAMINKERNEL = zamin_rvv.c
SMAXKERNEL = max_rvv.c
DMAXKERNEL = max_rvv.c
SMINKERNEL = min_rvv.c
DMINKERNEL = min_rvv.c
ISAMAXKERNEL = iamax_rvv.c
IDAMAXKERNEL = iamax_rvv.c
ICAMAXKERNEL = izamax_rvv.c
IZAMAXKERNEL = izamax_rvv.c
ISAMINKERNEL = iamin_rvv.c
IDAMINKERNEL = iamin_rvv.c
ICAMINKERNEL = izamin_rvv.c
IZAMINKERNEL = izamin_rvv.c
ISMAXKERNEL = imax_rvv.c
IDMAXKERNEL = imax_rvv.c
ISMINKERNEL = imin_rvv.c
IDMINKERNEL = imin_rvv.c
SASUMKERNEL = asum_rvv.c
DASUMKERNEL = asum_rvv.c
CASUMKERNEL = zasum_rvv.c
ZASUMKERNEL = zasum_rvv.c
SSUMKERNEL = sum_rvv.c
DSUMKERNEL = sum_rvv.c
CSUMKERNEL = zsum_rvv.c
ZSUMKERNEL = zsum_rvv.c
SAXPYKERNEL = axpy_rvv.c
DAXPYKERNEL = axpy_rvv.c
CAXPYKERNEL = zaxpy_rvv.c
ZAXPYKERNEL = zaxpy_rvv.c
SAXPBYKERNEL = axpby_rvv.c
DAXPBYKERNEL = axpby_rvv.c
CAXPBYKERNEL = zaxpby_rvv.c
ZAXPBYKERNEL = zaxpby_rvv.c
SCOPYKERNEL = copy_rvv.c
DCOPYKERNEL = copy_rvv.c
CCOPYKERNEL = zcopy_rvv.c
ZCOPYKERNEL = zcopy_rvv.c
SDOTKERNEL = dot_rvv.c
DDOTKERNEL = dot_rvv.c
CDOTKERNEL = zdot_rvv.c
ZDOTKERNEL = zdot_rvv.c
DSDOTKERNEL = dot_rvv.c
SNRM2KERNEL = nrm2_rvv.c
DNRM2KERNEL = nrm2_rvv.c
CNRM2KERNEL = znrm2_rvv.c
ZNRM2KERNEL = znrm2_rvv.c
SROTKERNEL = rot_rvv.c
DROTKERNEL = rot_rvv.c
CROTKERNEL = zrot_rvv.c
ZROTKERNEL = zrot_rvv.c
SSCALKERNEL = scal_rvv.c
DSCALKERNEL = scal_rvv.c
CSCALKERNEL = zscal_rvv.c
ZSCALKERNEL = zscal_rvv.c
SSWAPKERNEL = swap_rvv.c
DSWAPKERNEL = swap_rvv.c
CSWAPKERNEL = zswap_rvv.c
ZSWAPKERNEL = zswap_rvv.c
SGEMVNKERNEL = gemv_n_rvv.c
DGEMVNKERNEL = gemv_n_rvv.c
CGEMVNKERNEL = zgemv_n_rvv.c
ZGEMVNKERNEL = zgemv_n_rvv.c
SGEMVTKERNEL = gemv_t_rvv.c
DGEMVTKERNEL = gemv_t_rvv.c
CGEMVTKERNEL = zgemv_t_rvv.c
ZGEMVTKERNEL = zgemv_t_rvv.c
CTRMMKERNEL = ztrmmkernel_2x2_rvv.c
ZTRMMKERNEL = ztrmmkernel_2x2_rvv.c
# SGEMM_UNROLL_N set in params.h
ifeq ($(SGEMM_UNROLL_N), 8)
# UNROLL_M is VLMAX
SGEMMKERNEL = gemmkernel_rvv_v1x8.c
SGEMMINCOPY = gemm_ncopy_rvv_v1.c
SGEMMITCOPY = gemm_tcopy_rvv_v1.c
SGEMMONCOPY = gemm_ncopy_$(SGEMM_UNROLL_N)_rvv.c
SGEMMOTCOPY = gemm_tcopy_$(SGEMM_UNROLL_N)_rvv.c
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
STRMMKERNEL = trmmkernel_rvv_v1x8.c
STRMMUNCOPY_M = trmm_uncopy_rvv_v1.c
STRMMLNCOPY_M = trmm_lncopy_rvv_v1.c
STRMMUTCOPY_M = trmm_utcopy_rvv_v1.c
STRMMLTCOPY_M = trmm_ltcopy_rvv_v1.c
SSYMMUCOPY_M = symm_ucopy_rvv_v1.c
SSYMMLCOPY_M = symm_lcopy_rvv_v1.c
endif
# DGEMM_UNROLL_N set in params.h
ifeq ($(DGEMM_UNROLL_N), 8)
# UNROLL_M is VLMAX
DGEMMKERNEL = gemmkernel_rvv_v1x8.c
DGEMMINCOPY = gemm_ncopy_rvv_v1.c
DGEMMITCOPY = gemm_tcopy_rvv_v1.c
DGEMMONCOPY = gemm_ncopy_$(DGEMM_UNROLL_N)_rvv.c
DGEMMOTCOPY = gemm_tcopy_$(DGEMM_UNROLL_N)_rvv.c
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
DTRMMKERNEL = trmmkernel_rvv_v1x8.c
DTRMMUNCOPY_M = trmm_uncopy_rvv_v1.c
DTRMMLNCOPY_M = trmm_lncopy_rvv_v1.c
DTRMMUTCOPY_M = trmm_utcopy_rvv_v1.c
DTRMMLTCOPY_M = trmm_ltcopy_rvv_v1.c
DSYMMUCOPY_M = symm_ucopy_rvv_v1.c
DSYMMLCOPY_M = symm_lcopy_rvv_v1.c
endif
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
STRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c
STRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c
STRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c
STRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c
DTRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c
DTRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c
DTRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c
DTRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
TRSMCOPYLN_M = trsm_lncopy_rvv_v1.c
TRSMCOPYLT_M = trsm_ltcopy_rvv_v1.c
TRSMCOPYUN_M = trsm_uncopy_rvv_v1.c
TRSMCOPYUT_M = trsm_utcopy_rvv_v1.c
SSYMV_U_KERNEL = symv_U_rvv.c
SSYMV_L_KERNEL = symv_L_rvv.c
DSYMV_U_KERNEL = symv_U_rvv.c
DSYMV_L_KERNEL = symv_L_rvv.c
CSYMV_U_KERNEL = ../generic/zsymv_k.c
CSYMV_L_KERNEL = ../generic/zsymv_k.c
ZSYMV_U_KERNEL = ../generic/zsymv_k.c
ZSYMV_L_KERNEL = ../generic/zsymv_k.c
LSAME_KERNEL = ../generic/lsame.c
SCABS_KERNEL = ../generic/cabs.c
DCABS_KERNEL = ../generic/cabs.c
QCABS_KERNEL = ../generic/cabs.c
ifndef SGEMM_BETA
SGEMM_BETA = gemm_beta_rvv.c
endif
ifndef DGEMM_BETA
DGEMM_BETA = gemm_beta_rvv.c
endif
ifndef CGEMM_BETA
CGEMM_BETA = zgemm_beta_rvv.c
endif
ifndef ZGEMM_BETA
ZGEMM_BETA = zgemm_beta_rvv.c
endif

kernel/riscv64/amax_rvv.c (new file)

@@ -0,0 +1,102 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <float.h>
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m8()
#define VSETVL_MAX_M1 vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFMAXVV_FLOAT vfmax_vv_f32m8
#define VFABSV_FLOAT vfabs_v_f32m8
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m8()
#define VSETVL_MAX_M1 vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFMAXVV_FLOAT vfmax_vv_f64m8
#define VFABSV_FLOAT vfabs_v_f64m8
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
#endif
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
FLOAT maxf = 0.0;
if (n <= 0 || inc_x <= 0) return(maxf);
FLOAT_V_T vx, vmax;
FLOAT_V_T_M1 v_res;
v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1);
size_t vlmax = VSETVL_MAX;
vmax = VFMVVF_FLOAT(0.0, vlmax);
if(inc_x == 1) {
for (size_t vl; n > 0; n -= vl, x += vl) {
vl = VSETVL(n);
vx = VLEV_FLOAT(x, vl);
vx = VFABSV_FLOAT(vx, vl);
vmax = VFMAXVV_FLOAT(vmax, vx, vl);
}
} else {
BLASLONG stride_x = inc_x * sizeof(FLOAT);
for (size_t vl; n > 0; n -= vl, x += vl*inc_x) {
vl = VSETVL(n);
vx = VLSEV_FLOAT(x, stride_x, vl);
vx = VFABSV_FLOAT(vx, vl);
vmax = VFMAXVV_FLOAT(vmax, vx, vl);
}
}
v_res = VFREDMAXVS_FLOAT(v_res, vmax, v_res, vlmax);
maxf = VFMVFS_FLOAT_M1(v_res);
return(maxf);
}

kernel/riscv64/amin_rvv.c (new file)

@@ -0,0 +1,102 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <float.h>
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m8()
#define VSETVL_MAX_M1 vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFMINVV_FLOAT vfmin_vv_f32m8
#define VFABSV_FLOAT vfabs_v_f32m8
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m8()
#define VSETVL_MAX_M1 vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFMINVV_FLOAT vfmin_vv_f64m8
#define VFABSV_FLOAT vfabs_v_f64m8
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
#endif
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
FLOAT minf = 0.0;
if (n <= 0 || inc_x <= 0) return(minf);
FLOAT_V_T vx, vmin;
FLOAT_V_T_M1 v_res;
v_res = VFMVVF_FLOAT_M1(FLT_MAX, VSETVL_MAX_M1);
size_t vlmax = VSETVL_MAX;
vmin = VFMVVF_FLOAT(FLT_MAX, vlmax);
if(inc_x == 1) {
for (size_t vl; n > 0; n -= vl, x += vl) {
vl = VSETVL(n);
vx = VLEV_FLOAT(x, vl);
vx = VFABSV_FLOAT(vx, vl);
vmin = VFMINVV_FLOAT(vmin, vx, vl);
}
} else {
BLASLONG stride_x = inc_x * sizeof(FLOAT);
for (size_t vl; n > 0; n -= vl, x += vl*inc_x) {
vl = VSETVL(n);
vx = VLSEV_FLOAT(x, stride_x, vl);
vx = VFABSV_FLOAT(vx, vl);
vmin = VFMINVV_FLOAT(vmin, vx, vl);
}
}
v_res = VFREDMINVS_FLOAT(v_res, vmin, v_res, vlmax);
minf = VFMVFS_FLOAT_M1(v_res);
return(minf);
}

kernel/riscv64/asum_rvv.c (new file)

@@ -0,0 +1,99 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m8()
#define VSETVL_MAX_M1 vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFADDVV_FLOAT vfadd_vv_f32m8
#define VFABSV_FLOAT vfabs_v_f32m8
#define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m8()
#define VSETVL_MAX_M1 vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFADDVV_FLOAT vfadd_vv_f64m8
#define VFABSV_FLOAT vfabs_v_f64m8
#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
#endif
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
FLOAT asumf = 0.0;
if (n <= 0 || inc_x <= 0) return(asumf);
FLOAT_V_T vx, vsum;
FLOAT_V_T_M1 v_res;
v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1);
size_t vlmax = VSETVL_MAX;
vsum = VFMVVF_FLOAT(0.0, vlmax);
if(inc_x == 1) {
for (size_t vl; n > 0; n -= vl, x += vl) {
vl = VSETVL(n);
vx = VLEV_FLOAT(x, vl);
vx = VFABSV_FLOAT(vx, vl);
vsum = VFADDVV_FLOAT(vsum, vx, vl);
}
} else {
BLASLONG stride_x = inc_x * sizeof(FLOAT);
for (size_t vl; n > 0; n -= vl, x += vl*inc_x) {
vl = VSETVL(n);
vx = VLSEV_FLOAT(x, stride_x, vl);
vx = VFABSV_FLOAT(vx, vl);
vsum = VFADDVV_FLOAT(vsum, vx, vl);
}
}
v_res = VFREDSUMVS_FLOAT(v_res, vsum, v_res, vlmax);
asumf = VFMVFS_FLOAT_M1(v_res);
return(asumf);
}

kernel/riscv64/axpby_rvv.c (new file)

@@ -0,0 +1,171 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define FLOAT_V_T vfloat32m8_t
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VSEV_FLOAT vse32_v_f32m8
#define VSSEV_FLOAT vsse32_v_f32m8
#define VFMACCVF_FLOAT vfmacc_vf_f32m8
#define VFMULVF_FLOAT vfmul_vf_f32m8
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define FLOAT_V_T vfloat64m8_t
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VSEV_FLOAT vse64_v_f64m8
#define VSSEV_FLOAT vsse64_v_f64m8
#define VFMACCVF_FLOAT vfmacc_vf_f64m8
#define VFMULVF_FLOAT vfmul_vf_f64m8
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#endif
int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y)
{
FLOAT_V_T vx, vy;
if ( n < 0 ) return(0);
if ( beta == 0.0 ) {
if ( alpha == 0.0 ) {
if (1 == inc_y) {
memset(&y[0], 0, n * sizeof(FLOAT));
} else {
BLASLONG stride_y = inc_y * sizeof(FLOAT);
size_t vl = VSETVL(n);
vy = VFMVVF_FLOAT(0.0, vl);
for ( ; n > 0; n -= vl, y += vl*inc_y) {
vl = VSETVL(n);
VSSEV_FLOAT(y, stride_y, vy, vl);
}
}
} else {
if ((1 == inc_x) && (1 == inc_y)) {
for (size_t vl; n > 0; n -= vl, x += vl, y += vl) {
vl = VSETVL(n);
vx = VLEV_FLOAT(x, vl);
vy = VFMULVF_FLOAT(vx, alpha, vl);
VSEV_FLOAT (y, vy, vl);
}
} else if (1 == inc_x) {
BLASLONG stride_y = inc_y * sizeof(FLOAT);
for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) {
vl = VSETVL(n);
vx = VLEV_FLOAT(x, vl);
vy = VFMULVF_FLOAT(vx, alpha, vl);
VSSEV_FLOAT (y, stride_y, vy, vl);
}
} else if (1 == inc_y) {
BLASLONG stride_x = inc_x * sizeof(FLOAT);
for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) {
vl = VSETVL(n);
vx = VLSEV_FLOAT(x, stride_x, vl);
vy = VFMULVF_FLOAT(vx, alpha, vl);
VSEV_FLOAT (y, vy, vl);
}
} else {
BLASLONG stride_x = inc_x * sizeof(FLOAT);
BLASLONG stride_y = inc_y * sizeof(FLOAT);
for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) {
vl = VSETVL(n);
vx = VLSEV_FLOAT(x, stride_x, vl);
vy = VFMULVF_FLOAT(vx, alpha, vl);
VSSEV_FLOAT (y, stride_y, vy, vl);
}
}
}
} else {
if ( alpha == 0.0 ) {
if (1 == inc_y) {
for (size_t vl; n > 0; n -= vl, y += vl) {
vl = VSETVL(n);
vy = VLEV_FLOAT(y, vl);
vy = VFMULVF_FLOAT(vy, beta, vl);
VSEV_FLOAT (y, vy, vl);
}
} else {
BLASLONG stride_y = inc_y * sizeof(FLOAT);
for (size_t vl; n > 0; n -= vl, y += vl*inc_y) {
vl = VSETVL(n);
vy = VLSEV_FLOAT(y, stride_y, vl);
vy = VFMULVF_FLOAT(vy, beta, vl);
VSSEV_FLOAT (y, stride_y, vy, vl);
}
}
} else {
if ((1 == inc_x) && (1 == inc_y)) {
for (size_t vl; n > 0; n -= vl, x += vl, y += vl) {
vl = VSETVL(n);
vx = VLEV_FLOAT(x, vl);
vy = VLEV_FLOAT(y, vl);
vy = VFMULVF_FLOAT(vy, beta, vl);
vy = VFMACCVF_FLOAT(vy, alpha, vx, vl);
VSEV_FLOAT (y, vy, vl);
}
} else if (1 == inc_x) {
BLASLONG stride_y = inc_y * sizeof(FLOAT);
for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) {
vl = VSETVL(n);
vx = VLEV_FLOAT(x, vl);
vy = VLSEV_FLOAT(y, stride_y, vl);
vy = VFMULVF_FLOAT(vy, beta, vl);
vy = VFMACCVF_FLOAT(vy, alpha, vx, vl);
VSSEV_FLOAT (y, stride_y, vy, vl);
}
} else if (1 == inc_y) {
BLASLONG stride_x = inc_x * sizeof(FLOAT);
for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) {
vl = VSETVL(n);
vx = VLSEV_FLOAT(x, stride_x, vl);
vy = VLEV_FLOAT(y, vl);
vy = VFMULVF_FLOAT(vy, beta, vl);
vy = VFMACCVF_FLOAT(vy, alpha, vx, vl);
VSEV_FLOAT (y, vy, vl);
}
} else {
BLASLONG stride_x = inc_x * sizeof(FLOAT);
BLASLONG stride_y = inc_y * sizeof(FLOAT);
for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) {
vl = VSETVL(n);
vx = VLSEV_FLOAT(x, stride_x, vl);
vy = VLSEV_FLOAT(y, stride_y, vl);
vy = VFMULVF_FLOAT(vy, beta, vl);
vy = VFMACCVF_FLOAT(vy, alpha, vx, vl);
VSSEV_FLOAT (y, stride_y, vy, vl);
}
}
}
}
return(0);
}

kernel/riscv64/axpy_rvv.c (new file)

@@ -0,0 +1,109 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define FLOAT_V_T vfloat32m8_t
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VSEV_FLOAT vse32_v_f32m8
#define VSSEV_FLOAT vsse32_v_f32m8
#define VFMACCVF_FLOAT vfmacc_vf_f32m8
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define FLOAT_V_T vfloat64m8_t
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VSEV_FLOAT vse64_v_f64m8
#define VSSEV_FLOAT vsse64_v_f64m8
#define VFMACCVF_FLOAT vfmacc_vf_f64m8
#endif
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
if ( n <= 0 ) return(0);
if ( da == 0.0 ) return(0);
FLOAT_V_T vx, vy;
if(inc_x == 1 && inc_y == 1) {
for (size_t vl; n > 0; n -= vl, x += vl, y += vl) {
vl = VSETVL(n);
vx = VLEV_FLOAT(x, vl);
vy = VLEV_FLOAT(y, vl);
vy = VFMACCVF_FLOAT(vy, da, vx, vl);
VSEV_FLOAT (y, vy, vl);
}
} else if (1 == inc_y) {
BLASLONG stride_x = inc_x * sizeof(FLOAT);
for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) {
vl = VSETVL(n);
vx = VLSEV_FLOAT(x, stride_x, vl);
vy = VLEV_FLOAT(y, vl);
vy = VFMACCVF_FLOAT(vy, da, vx, vl);
VSEV_FLOAT(y, vy, vl);
}
} else if (1 == inc_x) {
BLASLONG stride_y = inc_y * sizeof(FLOAT);
for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) {
vl = VSETVL(n);
vx = VLEV_FLOAT(x, vl);
vy = VLSEV_FLOAT(y, stride_y, vl);
vy = VFMACCVF_FLOAT(vy, da, vx, vl);
VSSEV_FLOAT(y, stride_y, vy, vl);
}
} else {
BLASLONG stride_x = inc_x * sizeof(FLOAT);
BLASLONG stride_y = inc_y * sizeof(FLOAT);
for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) {
vl = VSETVL(n);
vx = VLSEV_FLOAT(x, stride_x, vl);
vy = VLSEV_FLOAT(y, stride_y, vl);
vy = VFMACCVF_FLOAT(vy, da, vx, vl);
VSSEV_FLOAT(y, stride_y, vy, vl);
}
}
return(0);
}

kernel/riscv64/copy_rvv.c (new file)

@@ -0,0 +1,94 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define FLOAT_V_T vfloat32m8_t
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VSEV_FLOAT vse32_v_f32m8
#define VSSEV_FLOAT vsse32_v_f32m8
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define FLOAT_V_T vfloat64m8_t
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VSEV_FLOAT vse64_v_f64m8
#define VSSEV_FLOAT vsse64_v_f64m8
#endif
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
if(n < 0) return(0);
FLOAT_V_T v0;
if(inc_x == 1 && inc_y == 1) {
for(size_t vl; n > 0; n -= vl, x += vl, y += vl) {
vl = VSETVL(n);
v0 = VLEV_FLOAT(x, vl);
VSEV_FLOAT(y, v0, vl);
}
} else if (inc_y == 1) {
BLASLONG stride_x = inc_x * sizeof(FLOAT);
for(size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) {
vl = VSETVL(n);
v0 = VLSEV_FLOAT(x, stride_x, vl);
VSEV_FLOAT(y, v0, vl);
}
} else if(inc_x == 1) {
BLASLONG stride_y = inc_y * sizeof(FLOAT);
for(size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) {
vl = VSETVL(n);
v0 = VLEV_FLOAT(x, vl);
VSSEV_FLOAT(y, stride_y, v0, vl);
}
} else {
BLASLONG stride_x = inc_x * sizeof(FLOAT);
BLASLONG stride_y = inc_y * sizeof(FLOAT);
for(size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) {
vl = VSETVL(n);
v0 = VLSEV_FLOAT(x, stride_x, vl);
VSSEV_FLOAT(y, stride_y, v0, vl);
}
}
return(0);
}

kernel/riscv64/dot_rvv.c (new file)

@@ -0,0 +1,126 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(DSDOT)
double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
#else
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
#endif
{
double dot = 0.0;
if ( n <= 0 ) return(dot);
size_t vlmax = vsetvlmax_e64m8();
vfloat64m8_t vr = vfmv_v_f_f64m8(0, vlmax);
if(inc_x == 1 && inc_y == 1) {
for (size_t vl; n > 0; n -= vl, x += vl, y += vl) {
vl = vsetvl_e64m8(n);
#if !defined(DOUBLE)
vfloat32m4_t vx = vle32_v_f32m4(x, vl);
vfloat32m4_t vy = vle32_v_f32m4(y, vl);
vr = vfwmacc_vv_f64m8(vr, vx, vy, vl);
#else
vfloat64m8_t vx = vle64_v_f64m8(x, vl);
vfloat64m8_t vy = vle64_v_f64m8(y, vl);
vr = vfmacc_vv_f64m8(vr, vx, vy, vl);
#endif
}
} else if (1 == inc_x) {
BLASLONG stride_y = inc_y * sizeof(FLOAT);
for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) {
vl = vsetvl_e64m8(n);
#if !defined(DOUBLE)
vfloat32m4_t vx = vle32_v_f32m4(x, vl);
vfloat32m4_t vy = vlse32_v_f32m4(y, stride_y, vl);
vr = vfwmacc_vv_f64m8(vr, vx, vy, vl);
#else
vfloat64m8_t vx = vle64_v_f64m8(x, vl);
vfloat64m8_t vy = vlse64_v_f64m8(y, stride_y, vl);
vr = vfmacc_vv_f64m8(vr, vx, vy, vl);
#endif
}
} else if (1 == inc_y) {
BLASLONG stride_x = inc_x * sizeof(FLOAT);
for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) {
vl = vsetvl_e64m8(n);
#if !defined(DOUBLE)
vfloat32m4_t vx = vlse32_v_f32m4(x, stride_x, vl);
vfloat32m4_t vy = vle32_v_f32m4(y, vl);
vr = vfwmacc_vv_f64m8(vr, vx, vy, vl);
#else
vfloat64m8_t vx = vlse64_v_f64m8(x, stride_x, vl);
vfloat64m8_t vy = vle64_v_f64m8(y, vl);
vr = vfmacc_vv_f64m8(vr, vx, vy, vl);
#endif
}
} else {
BLASLONG stride_x = inc_x * sizeof(FLOAT);
BLASLONG stride_y = inc_y * sizeof(FLOAT);
for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) {
vl = vsetvl_e64m8(n);
#if !defined(DOUBLE)
vfloat32m4_t vx = vlse32_v_f32m4(x, stride_x, vl);
vfloat32m4_t vy = vlse32_v_f32m4(y, stride_y, vl);
vr = vfwmacc_vv_f64m8(vr, vx, vy, vl);
#else
vfloat64m8_t vx = vlse64_v_f64m8(x, stride_x, vl);
vfloat64m8_t vy = vlse64_v_f64m8(y, stride_y, vl);
vr = vfmacc_vv_f64m8(vr, vx, vy, vl);
#endif
}
}
vfloat64m1_t vec_zero = vfmv_v_f_f64m1(0, vlmax);
vfloat64m1_t vec_sum = vfredusum_vs_f64m8_f64m1(vec_zero, vr, vec_zero, vlmax);
dot = vfmv_f_s_f64m1_f64(vec_sum);
return(dot);
}


@@ -0,0 +1,89 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define FLOAT_V_T vfloat32m8_t
#define VLEV_FLOAT vle32_v_f32m8
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMULVF_FLOAT vfmul_vf_f32m8
#define VSEV_FLOAT vse32_v_f32m8
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define FLOAT_V_T vfloat64m8_t
#define VLEV_FLOAT vle64_v_f64m8
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMULVF_FLOAT vfmul_vf_f64m8
#define VSEV_FLOAT vse64_v_f64m8
#endif
// Optimizes the implementation in ../generic/gemm_beta.c
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
IFLOAT *dummy2, BLASLONG dummy3, IFLOAT *dummy4, BLASLONG dummy5,
FLOAT *c, BLASLONG ldc)
{
BLASLONG chunk;
FLOAT *c_offset;
size_t vl;
FLOAT_V_T vx;
if (beta == ZERO) {
vl = VSETVL(m);
vx = VFMVVF_FLOAT(0.0, vl);
for( ; n > 0; n--, c += ldc) {
c_offset = c;
for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl) {
vl = VSETVL(chunk);
VSEV_FLOAT(c_offset, vx, vl);
}
}
} else {
for( ; n > 0; n--, c += ldc) {
c_offset = c;
for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl) {
vl = VSETVL(chunk);
vx = VLEV_FLOAT(c_offset, vl);
vx = VFMULVF_FLOAT(vx, beta, vl);
VSEV_FLOAT(c_offset, vx, vl);
}
}
}
return 0;
}


@@ -0,0 +1,164 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m1(n)
#define FLOAT_V_T vfloat32m1_t
#define VLEV_FLOAT vle32_v_f32m1
#define VSEV_FLOAT vse32_v_f32m1
#define VSSEG2_FLOAT vsseg2e32_v_f32m1
#define VSSEG4_FLOAT vsseg4e32_v_f32m1
#define VSSEG8_FLOAT vsseg8e32_v_f32m1
#else
#define VSETVL(n) vsetvl_e64m1(n)
#define FLOAT_V_T vfloat64m1_t
#define VLEV_FLOAT vle64_v_f64m1
#define VSEV_FLOAT vse64_v_f64m1
#define VSSEG2_FLOAT vsseg2e64_v_f64m1
#define VSSEG4_FLOAT vsseg4e64_v_f64m1
#define VSSEG8_FLOAT vsseg8e64_v_f64m1
#endif
// Optimizes the implementation in ../generic/gemm_ncopy_8.c
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b)
{
BLASLONG i, j;
FLOAT *a_offset;
FLOAT *a_offset1, *a_offset2, *a_offset3, *a_offset4;
FLOAT *a_offset5, *a_offset6, *a_offset7, *a_offset8;
FLOAT *b_offset;
FLOAT_V_T v1, v2, v3, v4, v5, v6, v7, v8;
size_t vl;
//fprintf(stderr, "gemm_ncopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda);
a_offset = a;
b_offset = b;
for(j = (n >> 3); j > 0; j--) {
a_offset1 = a_offset;
a_offset2 = a_offset1 + lda;
a_offset3 = a_offset2 + lda;
a_offset4 = a_offset3 + lda;
a_offset5 = a_offset4 + lda;
a_offset6 = a_offset5 + lda;
a_offset7 = a_offset6 + lda;
a_offset8 = a_offset7 + lda;
a_offset += 8 * lda;
for(i = m; i > 0; i -= vl) {
vl = VSETVL(i);
v1 = VLEV_FLOAT(a_offset1, vl);
v2 = VLEV_FLOAT(a_offset2, vl);
v3 = VLEV_FLOAT(a_offset3, vl);
v4 = VLEV_FLOAT(a_offset4, vl);
v5 = VLEV_FLOAT(a_offset5, vl);
v6 = VLEV_FLOAT(a_offset6, vl);
v7 = VLEV_FLOAT(a_offset7, vl);
v8 = VLEV_FLOAT(a_offset8, vl);
VSSEG8_FLOAT(b_offset, v1, v2, v3, v4, v5, v6, v7, v8, vl);
a_offset1 += vl;
a_offset2 += vl;
a_offset3 += vl;
a_offset4 += vl;
a_offset5 += vl;
a_offset6 += vl;
a_offset7 += vl;
a_offset8 += vl;
b_offset += vl*8;
}
}
if (n & 4) {
a_offset1 = a_offset;
a_offset2 = a_offset1 + lda;
a_offset3 = a_offset2 + lda;
a_offset4 = a_offset3 + lda;
a_offset += 4 * lda;
for(i = m; i > 0; i -= vl) {
vl = VSETVL(i);
v1 = VLEV_FLOAT(a_offset1, vl);
v2 = VLEV_FLOAT(a_offset2, vl);
v3 = VLEV_FLOAT(a_offset3, vl);
v4 = VLEV_FLOAT(a_offset4, vl);
VSSEG4_FLOAT(b_offset, v1, v2, v3, v4, vl);
a_offset1 += vl;
a_offset2 += vl;
a_offset3 += vl;
a_offset4 += vl;
b_offset += vl*4;
}
}
if (n & 2) {
a_offset1 = a_offset;
a_offset2 = a_offset1 + lda;
a_offset += 2 * lda;
for(i = m; i > 0; i -= vl) {
vl = VSETVL(i);
v1 = VLEV_FLOAT(a_offset1, vl);
v2 = VLEV_FLOAT(a_offset2, vl);
VSSEG2_FLOAT(b_offset, v1, v2, vl);
a_offset1 += vl;
a_offset2 += vl;
b_offset += vl*2;
}
}
if (n & 1) {
a_offset1 = a_offset;
for(i = m; i > 0; i -= vl) {
vl = VSETVL(i);
v1 = VLEV_FLOAT(a_offset1, vl);
VSEV_FLOAT(b_offset, v1, vl);
a_offset1 += vl;
b_offset += vl;
}
}
return 0;
}


@@ -0,0 +1,76 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m2(n)
#define FLOAT_V_T vfloat32m2_t
#define VLEV_FLOAT vle32_v_f32m2
#define VLSEV_FLOAT vlse32_v_f32m2
#define VSEV_FLOAT vse32_v_f32m2
#else
#define VSETVL(n) vsetvl_e64m2(n)
#define FLOAT_V_T vfloat64m2_t
#define VLEV_FLOAT vle64_v_f64m2
#define VLSEV_FLOAT vlse64_v_f64m2
#define VSEV_FLOAT vse64_v_f64m2
#endif
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b)
{
BLASLONG i, j;
FLOAT *a_offset;
FLOAT *a_offset1;
FLOAT *b_offset;
FLOAT_V_T v0;
size_t vl;
//fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda);
a_offset = a;
b_offset = b;
for(j = n; j > 0; j -= vl) {
vl = VSETVL(j);
a_offset1 = a_offset;
a_offset += vl * lda;
for(i = m; i > 0; i--) {
v0 = VLSEV_FLOAT(a_offset1, lda * sizeof(FLOAT), vl);
VSEV_FLOAT(b_offset, v0, vl);
a_offset1++;
b_offset += vl;
}
}
return 0;
}


@@ -0,0 +1,264 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m1(n)
#define FLOAT_V_T vfloat32m1_t
#define VLEV_FLOAT vle32_v_f32m1
#define VLSEV_FLOAT vlse32_v_f32m1
#define VSEV_FLOAT vse32_v_f32m1
#define VLSSEG2_FLOAT vlsseg2e32_v_f32m1
#define VSSEG2_FLOAT vsseg2e32_v_f32m1
#define VLSSEG4_FLOAT vlsseg4e32_v_f32m1
#define VSSEG4_FLOAT vsseg4e32_v_f32m1
#define VLSSEG8_FLOAT vlsseg8e32_v_f32m1
#define VSSEG8_FLOAT vsseg8e32_v_f32m1
#else
#define VSETVL(n) vsetvl_e64m1(n)
#define FLOAT_V_T vfloat64m1_t
#define VLEV_FLOAT vle64_v_f64m1
#define VLSEV_FLOAT vlse64_v_f64m1
#define VSEV_FLOAT vse64_v_f64m1
#define VLSSEG2_FLOAT vlsseg2e64_v_f64m1
#define VSSEG2_FLOAT vsseg2e64_v_f64m1
#define VLSSEG4_FLOAT vlsseg4e64_v_f64m1
#define VSSEG4_FLOAT vsseg4e64_v_f64m1
#define VLSSEG8_FLOAT vlsseg8e64_v_f64m1
#define VSSEG8_FLOAT vsseg8e64_v_f64m1
#endif
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
{
BLASLONG i, j;
IFLOAT *aoffset;
IFLOAT *aoffset1;
IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4;
FLOAT_V_T v0, v1, v2, v3, v4, v5, v6, v7;
// fprintf(stderr, "gemm_tcopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda);
aoffset = a;
boffset = b;
boffset2 = b + m * (n & ~7);
boffset3 = b + m * (n & ~3);
boffset4 = b + m * (n & ~1);
for(j = (m >> 3); j > 0; j--) {
aoffset1 = aoffset;
aoffset += 8 * lda;
boffset1 = boffset;
boffset += 64;
for(i = (n >> 3); i > 0; i--) {
size_t vl = 8;
VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT), vl);
VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl);
aoffset1 += 8;
boffset1 += m * 8;
}
if (n & 4) {
size_t vl = 8;
VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT), vl);
VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl);
aoffset1 += 4;
boffset2 += 32;
}
if (n & 2) {
size_t vl = 8;
VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT), vl);
VSSEG2_FLOAT(boffset3, v0, v1, vl);
aoffset1 += 2;
boffset3 += 16;
}
if (n & 1) {
size_t vl = 8;
v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
VSEV_FLOAT(boffset4, v0, vl);
aoffset1 += 1;
boffset4 += 8;
}
}
if (m & 4) {
aoffset1 = aoffset;
aoffset += 4 * lda;
boffset1 = boffset;
boffset += 32;
for(i = (n >> 3); i > 0; i--) {
size_t vl = 4;
VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT), vl);
VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl);
aoffset1 += 8;
boffset1 += m * 8;
}
if (n & 4) {
size_t vl = 4;
VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT), vl);
VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl);
aoffset1 += 4;
boffset2 += 16;
}
if (n & 2) {
size_t vl = 4;
VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT), vl);
VSSEG2_FLOAT(boffset3, v0, v1, vl);
aoffset1 += 2;
boffset3 += 8;
}
if (n & 1) {
size_t vl = 4;
v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
VSEV_FLOAT(boffset4, v0, vl);
aoffset1 += 1;
boffset4 += 4;
}
}
if (m & 2) {
aoffset1 = aoffset;
aoffset += 2 * lda;
boffset1 = boffset;
boffset += 16;
for(i = (n >> 3); i > 0; i--) {
size_t vl = 2;
VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT), vl);
VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl);
aoffset1 += 8;
boffset1 += m * 8;
}
if (n & 4) {
size_t vl = 2;
VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT), vl);
VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl);
aoffset1 += 4;
boffset2 += 8;
}
if (n & 2) {
size_t vl = 2;
VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT), vl);
VSSEG2_FLOAT(boffset3, v0, v1, vl);
aoffset1 += 2;
boffset3 += 4;
}
if (n & 1) {
size_t vl = 2;
v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
VSEV_FLOAT(boffset4, v0, vl);
aoffset1 += 1;
boffset4 += 2;
}
}
if (m & 1) {
aoffset1 = aoffset;
boffset1 = boffset;
for(i = (n >> 3); i > 0; i--) {
size_t vl = 8;
v0 = VLEV_FLOAT(aoffset1, vl);
VSEV_FLOAT(boffset1, v0, vl);
aoffset1 += 8;
boffset1 += 8 * m;
}
if (n & 4) {
size_t vl = 4;
v0 = VLEV_FLOAT(aoffset1, vl);
VSEV_FLOAT(boffset2, v0, vl);
aoffset1 += 4;
//boffset2 += 4;
}
if (n & 2) {
size_t vl = 2;
v0 = VLEV_FLOAT(aoffset1, vl);
VSEV_FLOAT(boffset3, v0, vl);
aoffset1 += 2;
// boffset3 += 2;
}
if (n & 1) {
*(boffset4) = *(aoffset1);
// aoffset1 ++;
// boffset4 ++;
}
}
return 0;
}


@@ -0,0 +1,74 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m2(n)
#define FLOAT_V_T vfloat32m2_t
#define VLEV_FLOAT vle32_v_f32m2
#define VSEV_FLOAT vse32_v_f32m2
#else
#define VSETVL(n) vsetvl_e64m2(n)
#define FLOAT_V_T vfloat64m2_t
#define VLEV_FLOAT vle64_v_f64m2
#define VSEV_FLOAT vse64_v_f64m2
#endif
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
{
BLASLONG i, j;
IFLOAT *aoffset;
IFLOAT *aoffset1;
IFLOAT *boffset;
FLOAT_V_T v0;
size_t vl;
//fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda);
aoffset = a;
boffset = b;
for(j = n; j > 0; j -= vl) {
vl = VSETVL(j);
aoffset1 = aoffset;
aoffset += vl;
for(i = m; i > 0; i--) {
v0 = VLEV_FLOAT(aoffset1, vl);
VSEV_FLOAT(boffset, v0, vl);
aoffset1 += lda;
boffset += vl;
}
}
return 0;
}


@@ -0,0 +1,601 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m2(n)
#define FLOAT_V_T vfloat32m2_t
#define VLEV_FLOAT vle32_v_f32m2
#define VSEV_FLOAT vse32_v_f32m2
#define VFMVVF_FLOAT vfmv_v_f_f32m2
#define VFMACCVF_FLOAT vfmacc_vf_f32m2
#else
#define VSETVL(n) vsetvl_e64m2(n)
#define FLOAT_V_T vfloat64m2_t
#define VLEV_FLOAT vle64_v_f64m2
#define VSEV_FLOAT vse64_v_f64m2
#define VFMVVF_FLOAT vfmv_v_f_f64m2
#define VFMACCVF_FLOAT vfmacc_vf_f64m2
#endif
int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, IFLOAT* ba, IFLOAT* bb, FLOAT* C, BLASLONG ldc
#ifdef TRMMKERNEL
,BLASLONG offset
#endif
)
{
BLASLONG i,j,k;
FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7;
IFLOAT *ptrba,*ptrbb;
//fprintf(stderr, "%s, bm=%ld bn=%ld bk=%ld alpha=%f ldc=%ld\n", __FUNCTION__, bm, bn, bk, alpha, ldc); // Debug
FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7;
FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7;
size_t vl;
// N:8
for (j = bn/8; j > 0; j--) {
C0 = C;
C1 = C0 + ldc;
C2 = C1 + ldc;
C3 = C2 + ldc;
C4 = C3 + ldc;
C5 = C4 + ldc;
C6 = C5 + ldc;
C7 = C6 + ldc;
ptrba = ba;
for (i = bm; i > 0; i -= vl) {
vl = VSETVL(i);
ptrbb = bb;
vres0 = VFMVVF_FLOAT(0.0, vl);
vres1 = VFMVVF_FLOAT(0.0, vl);
vres2 = VFMVVF_FLOAT(0.0, vl);
vres3 = VFMVVF_FLOAT(0.0, vl);
vres4 = VFMVVF_FLOAT(0.0, vl);
vres5 = VFMVVF_FLOAT(0.0, vl);
vres6 = VFMVVF_FLOAT(0.0, vl);
vres7 = VFMVVF_FLOAT(0.0, vl);
#if 0
for (k = bk; k > 0; k--) {
va0 = VLEV_FLOAT(ptrba, vl);
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl);
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl);
vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl);
vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl);
vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl);
vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl);
ptrba += vl;
ptrbb += 8;
}
#else
// Unroll K
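/* The loop below is unrolled by 8 and software-pipelined: each vector load
   of A is issued one step before the FMAs that consume it (va1 is loaded
   while va0 is still being used), which presumably helps hide load latency.
   The #if 0 branch above keeps the plain one-load, eight-FMA reference
   version of the same computation. */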
for (k = bk/8; k > 0; k--) {
va0 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
va1 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl);
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl);
vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl);
vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl);
vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl);
vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl);
ptrbb += 8;
va2 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl);
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl);
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl);
vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va1, vl);
vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va1, vl);
vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va1, vl);
vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va1, vl);
ptrbb += 8;
va3 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl);
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl);
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl);
vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va2, vl);
vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va2, vl);
vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va2, vl);
vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va2, vl);
ptrbb += 8;
va4 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl);
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl);
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl);
vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va3, vl);
vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va3, vl);
vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va3, vl);
vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va3, vl);
ptrbb += 8;
va5 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl);
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl);
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl);
vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va4, vl);
vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va4, vl);
vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va4, vl);
vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va4, vl);
ptrbb += 8;
va6 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl);
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl);
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl);
vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va5, vl);
vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va5, vl);
vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va5, vl);
vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va5, vl);
ptrbb += 8;
va7 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl);
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl);
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl);
vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va6, vl);
vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va6, vl);
vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va6, vl);
vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va6, vl);
ptrbb += 8;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl);
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl);
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl);
vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va7, vl);
vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va7, vl);
vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va7, vl);
vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va7, vl);
ptrbb += 8;
}
// K remainder
for (k = bk&7; k > 0; k--) {
va0 = VLEV_FLOAT(ptrba, vl);
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl);
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl);
vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl);
vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl);
vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl);
vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl);
ptrbb += 8;
ptrba += vl;
}
#endif
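/* Epilogue for this tile: load vl elements of each of the eight C columns,
   fold the accumulators in with a scalar FMA (C := C + alpha * acc) and
   store the result back. */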
va0 = VLEV_FLOAT(C0, vl);
va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl);
VSEV_FLOAT(C0, va0, vl);
va1 = VLEV_FLOAT(C1, vl);
va1 = VFMACCVF_FLOAT(va1, alpha, vres1, vl);
VSEV_FLOAT(C1, va1, vl);
va2 = VLEV_FLOAT(C2, vl);
va2 = VFMACCVF_FLOAT(va2, alpha, vres2, vl);
VSEV_FLOAT(C2, va2, vl);
va3 = VLEV_FLOAT(C3, vl);
va3 = VFMACCVF_FLOAT(va3, alpha, vres3, vl);
VSEV_FLOAT(C3, va3, vl);
va4 = VLEV_FLOAT(C4, vl);
va4 = VFMACCVF_FLOAT(va4, alpha, vres4, vl);
VSEV_FLOAT(C4, va4, vl);
va5 = VLEV_FLOAT(C5, vl);
va5 = VFMACCVF_FLOAT(va5, alpha, vres5, vl);
VSEV_FLOAT(C5, va5, vl);
va6 = VLEV_FLOAT(C6, vl);
va6 = VFMACCVF_FLOAT(va6, alpha, vres6, vl);
VSEV_FLOAT(C6, va6, vl);
va7 = VLEV_FLOAT(C7, vl);
va7 = VFMACCVF_FLOAT(va7, alpha, vres7, vl);
VSEV_FLOAT(C7, va7, vl);
C0 += vl;
C1 += vl;
C2 += vl;
C3 += vl;
C4 += vl;
C5 += vl;
C6 += vl;
C7 += vl;
}
bb += (bk<<3);
C += (ldc<<3);
}
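/* The remaining column blocks below (N:4, N:2, N:1) repeat the same
   strip-mined pattern with four, two and one accumulator vectors. */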
// N:4
if (bn & 4) {
C0 = C;
C1 = C0 + ldc;
C2 = C1 + ldc;
C3 = C2 + ldc;
ptrba = ba;
for (i = bm; i > 0; i -= vl) {
vl = VSETVL(i);
ptrbb = bb;
vres0 = VFMVVF_FLOAT(0.0, vl);
vres1 = VFMVVF_FLOAT(0.0, vl);
vres2 = VFMVVF_FLOAT(0.0, vl);
vres3 = VFMVVF_FLOAT(0.0, vl);
#if 0
for (k = bk; k > 0; k--) {
va0 = VLEV_FLOAT(ptrba, vl);
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl);
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl);
ptrba += vl;
ptrbb += 4;
}
#else
// Unroll K
for (k = bk/8; k > 0; k--) {
va0 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
va1 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl);
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl);
ptrbb += 4;
va2 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl);
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl);
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl);
ptrbb += 4;
va3 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl);
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl);
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl);
ptrbb += 4;
va4 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl);
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl);
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl);
ptrbb += 4;
va5 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl);
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl);
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl);
ptrbb += 4;
va6 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl);
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl);
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl);
ptrbb += 4;
va7 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl);
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl);
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl);
ptrbb += 4;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl);
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl);
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl);
ptrbb += 4;
}
// K remainder
for (k = bk&7; k > 0; k--) {
va0 = VLEV_FLOAT(ptrba, vl);
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl);
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl);
ptrbb += 4;
ptrba += vl;
}
#endif
va0 = VLEV_FLOAT(C0, vl);
va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl);
VSEV_FLOAT(C0, va0, vl);
va1 = VLEV_FLOAT(C1, vl);
va1 = VFMACCVF_FLOAT(va1, alpha, vres1, vl);
VSEV_FLOAT(C1, va1, vl);
va2 = VLEV_FLOAT(C2, vl);
va2 = VFMACCVF_FLOAT(va2, alpha, vres2, vl);
VSEV_FLOAT(C2, va2, vl);
va3 = VLEV_FLOAT(C3, vl);
va3 = VFMACCVF_FLOAT(va3, alpha, vres3, vl);
VSEV_FLOAT(C3, va3, vl);
C0 += vl;
C1 += vl;
C2 += vl;
C3 += vl;
}
bb += (bk<<2);
C += (ldc<<2);
}
// N:2
if (bn & 2) {
C0 = C;
C1 = C0 + ldc;
ptrba = ba;
for (i = bm; i > 0; i -= vl) {
vl = VSETVL(i);
ptrbb = bb;
vres0 = VFMVVF_FLOAT(0.0, vl);
vres1 = VFMVVF_FLOAT(0.0, vl);
#if 0
for (k = bk; k > 0; k--) {
va0 = VLEV_FLOAT(ptrba, vl);
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
ptrba += vl;
ptrbb += 2;
}
#else
// Unroll K
for (k = bk/8; k > 0; k--) {
va0 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
va1 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
ptrbb += 2;
va2 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl);
ptrbb += 2;
va3 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl);
ptrbb += 2;
va4 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl);
ptrbb += 2;
va5 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl);
ptrbb += 2;
va6 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl);
ptrbb += 2;
va7 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl);
ptrbb += 2;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl);
ptrbb += 2;
}
// K remainder
for (k = bk&7; k > 0; k--) {
va0 = VLEV_FLOAT(ptrba, vl);
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
ptrbb += 2;
ptrba += vl;
}
#endif
va0 = VLEV_FLOAT(C0, vl);
va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl);
VSEV_FLOAT(C0, va0, vl);
va1 = VLEV_FLOAT(C1, vl);
va1 = VFMACCVF_FLOAT(va1, alpha, vres1, vl);
VSEV_FLOAT(C1, va1, vl);
C0 += vl;
C1 += vl;
}
bb += (bk<<1);
C += (ldc<<1);
}
// N:1
if (bn & 1) {
C0 = C;
ptrba = ba;
for (i = bm; i > 0; i -= vl) {
vl = VSETVL(i);
ptrbb = bb;
vres0 = VFMVVF_FLOAT(0.0, vl);
#if 0
for (k = bk; k > 0; k--) {
va0 = VLEV_FLOAT(ptrba, vl);
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
ptrba += vl;
ptrbb += 1;
}
#else
// Unroll K
for (k = bk/8; k > 0; k--) {
va0 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
va1 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
ptrbb += 1;
va2 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl);
ptrbb += 1;
va3 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl);
ptrbb += 1;
va4 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl);
ptrbb += 1;
va5 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl);
ptrbb += 1;
va6 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl);
ptrbb += 1;
va7 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl);
ptrbb += 1;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl);
ptrbb += 1;
}
// K remainder
for (k = bk&7; k > 0; k--) {
va0 = VLEV_FLOAT(ptrba, vl);
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
ptrbb += 1;
ptrba += vl;
}
#endif
va0 = VLEV_FLOAT(C0, vl);
va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl);
VSEV_FLOAT(C0, va0, vl);
C0 += vl;
}
bb += (bk);
C += (ldc);
}
return 0;
}

@@ -0,0 +1,94 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define FLOAT_V_T vfloat32m8_t
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VSEV_FLOAT vse32_v_f32m8
#define VSSEV_FLOAT vsse32_v_f32m8
#define VFMACCVF_FLOAT vfmacc_vf_f32m8
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define FLOAT_V_T vfloat64m8_t
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VSEV_FLOAT vse64_v_f64m8
#define VSSEV_FLOAT vsse64_v_f64m8
#define VFMACCVF_FLOAT vfmacc_vf_f64m8
#endif
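/* GEMV for the non-transposed case, y += alpha * A * x: y is strip-mined
   into chunks of vl elements; for each chunk, the inner loop over the n
   columns accumulates (alpha * x[i]) * A[chunk, i] into the chunk with a
   scalar-times-vector FMA, using strided loads/stores of y when
   inc_y != 1. */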
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
if(n < 0) return(0);
FLOAT *a_ptr, *x_ptr;
BLASLONG i;
FLOAT_V_T va, vy;
if(inc_y == 1) {
for (size_t vl; m > 0; m -= vl, y += vl, a += vl) {
vl = VSETVL(m);
a_ptr = a;
x_ptr = x;
vy = VLEV_FLOAT(y, vl);
for(i = 0; i < n; i++) {
va = VLEV_FLOAT(a_ptr, vl);
vy = VFMACCVF_FLOAT(vy, (alpha * (*x_ptr)), va, vl);
a_ptr += lda;
x_ptr += inc_x;
}
VSEV_FLOAT(y, vy, vl);
}
} else {
BLASLONG stride_y = inc_y * sizeof(FLOAT);
for (size_t vl; m > 0; m -= vl, y += vl*inc_y, a += vl) {
vl = VSETVL(m);
a_ptr = a;
x_ptr = x;
vy = VLSEV_FLOAT(y, stride_y, vl);
for(i = 0; i < n; i++) {
va = VLEV_FLOAT(a_ptr, vl);
vy = VFMACCVF_FLOAT(vy, (alpha * (*x_ptr)), va, vl);
a_ptr += lda;
x_ptr += inc_x;
}
VSSEV_FLOAT(y, stride_y, vy, vl);
}
}
return(0);
}

kernel/riscv64/gemv_t_rvv.c Normal file
@@ -0,0 +1,119 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m8()
#define VSETVL_MAX_M1 vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1
#define VFMACCVV_FLOAT vfmacc_vv_f32m8
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m8()
#define VSETVL_MAX_M1 vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1
#define VFMACCVV_FLOAT vfmacc_vv_f64m8
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
#endif
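/* GEMV for the transposed case: for every column of A, the dot product
   with x is accumulated element-wise into vr across vl-sized strips,
   reduced to a scalar with vfredusum, and applied as y[j] += alpha * dot;
   x is read with a stride when inc_x != 1. */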
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG i, j;
FLOAT *a_ptr, *x_ptr;
FLOAT_V_T va, vx, vr;
FLOAT_V_T_M1 v_res, v_z0;
size_t vlmax = VSETVL_MAX_M1;
v_res = VFMVVF_FLOAT_M1(0, vlmax);
v_z0 = VFMVVF_FLOAT_M1(0, vlmax);
vlmax = VSETVL_MAX;
if(inc_x == 1) {
for(i = 0; i < n; i++) {
j = m;
a_ptr = a;
x_ptr = x;
vr = VFMVVF_FLOAT(0, vlmax);
for (size_t vl; j > 0; j -= vl, a_ptr += vl, x_ptr += vl) {
vl = VSETVL(j);
va = VLEV_FLOAT(a_ptr, vl);
vx = VLEV_FLOAT(x_ptr, vl);
vr = VFMACCVV_FLOAT(vr, va, vx, vl);
}
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax);
*y += alpha * VFMVFS_FLOAT_M1(v_res);
y += inc_y;
a += lda;
}
} else {
BLASLONG stride_x = inc_x * sizeof(FLOAT);
for(i = 0; i < n; i++) {
j = m;
a_ptr = a;
x_ptr = x;
vr = VFMVVF_FLOAT(0, vlmax);
for (size_t vl; j > 0; j -= vl, a_ptr += vl, x_ptr += vl*inc_x) {
vl = VSETVL(j);
va = VLEV_FLOAT(a_ptr, vl);
vx = VLSEV_FLOAT(x_ptr, stride_x, vl);
vr = VFMACCVV_FLOAT(vr, va, vx, vl);
}
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax);
*y += alpha * VFMVFS_FLOAT_M1(v_res);
y += inc_y;
a += lda;
}
}
return(0);
}

kernel/riscv64/iamax_rvv.c Normal file
@@ -0,0 +1,150 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(DOUBLE)
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m8()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
#define MASK_T vbool8_t
#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFABSV_FLOAT vfabs_v_f64m8
#define VFMAXVV_FLOAT vfmax_vv_f64m8
#define VFIRSTM vfirst_m_b8
#define UINT_V_T vuint64m8_t
#define VIDV_MASK_UINT vid_v_u64m8_m
#define VIDV_UINT vid_v_u64m8
#define VADDVX_MASK_UINT vadd_vx_u64m8_m
#define VADDVX_UINT vadd_vx_u64m8
#define VMVVX_UINT vmv_v_x_u64m8
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
#define VSLIDEDOWN_UINT vslidedown_vx_u64m8
#define VMVVXS_UINT vmv_x_s_u64m8_u64
#else
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m8()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
#define MASK_T vbool4_t
#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFABSV_FLOAT vfabs_v_f32m8
#define VFMAXVV_FLOAT vfmax_vv_f32m8
#define VFIRSTM vfirst_m_b4
#define UINT_V_T vuint32m8_t
#define VIDV_MASK_UINT vid_v_u32m8_m
#define VIDV_UINT vid_v_u32m8
#define VADDVX_MASK_UINT vadd_vx_u32m8_m
#define VADDVX_UINT vadd_vx_u32m8
#define VMVVX_UINT vmv_v_x_u32m8
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
#define VSLIDEDOWN_UINT vslidedown_vx_u32m8
#define VMVVXS_UINT vmv_x_s_u32m8_u32
#endif
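/* iamax: index of the element with the largest absolute value.  Each lane
   keeps a running maximum of |x| in v_max and the global index at which it
   was seen in v_max_index: a compare mask marks the lanes that improved,
   vid/vadd record their absolute position j + lane, and vfmax updates the
   lane maxima.  After the loop, a max reduction yields the scalar maximum,
   a greater-or-equal mask plus vfirst locate the first lane that holds it,
   and vslidedown/vmv extract that lane's stored index (returned 1-based). */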
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
unsigned int max_index = 0;
if (n <= 0 || inc_x <= 0) return(max_index);
FLOAT_V_T vx, v_max;
UINT_V_T v_max_index;
MASK_T mask;
size_t vlmax = VSETVL_MAX;
v_max_index = VMVVX_UINT(0, vlmax);
v_max = VFMVVF_FLOAT(-1, vlmax);
BLASLONG j=0;
FLOAT maxf=0.0;
if(inc_x == 1) {
for (size_t vl; n > 0; n -= vl, x += vl, j += vl) {
vl = VSETVL(n);
vx = VLEV_FLOAT(x, vl);
vx = VFABSV_FLOAT(vx, vl);
//index where element greater than v_max
mask = VMFLTVV_FLOAT(v_max, vx, vl);
v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl);
v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl);
//update v_max
v_max = VFMAXVV_FLOAT(v_max, vx, vl);
}
} else {
BLASLONG stride_x = inc_x * sizeof(FLOAT);
for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) {
vl = VSETVL(n);
vx = VLSEV_FLOAT(x, stride_x, vl);
vx = VFABSV_FLOAT(vx, vl);
//index where element greater than v_max
mask = VMFLTVV_FLOAT(v_max, vx, vl);
v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl);
v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl);
//update v_max
v_max = VFMAXVV_FLOAT(v_max, vx, vl);
}
}
FLOAT_V_T_M1 v_res, v_z0;
v_res = VFMVVF_FLOAT_M1(0, vlmax);
v_z0 = VFMVVF_FLOAT_M1(0, vlmax);
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, vlmax);
maxf = VFMVFS_FLOAT_M1(v_res);
mask = VMFGEVF_FLOAT(v_max, maxf, vlmax);
max_index = VFIRSTM(mask, vlmax);
v_max_index = VSLIDEDOWN_UINT(v_max_index, v_max_index, max_index, vlmax);
max_index = VMVVXS_UINT(v_max_index);
return(max_index+1);
}

kernel/riscv64/iamin_rvv.c Normal file
@@ -0,0 +1,151 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <float.h>
#if defined(DOUBLE)
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m8()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
#define MASK_T vbool8_t
#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFABSV_FLOAT vfabs_v_f64m8
#define VFMINVV_FLOAT vfmin_vv_f64m8
#define VFIRSTM vfirst_m_b8
#define UINT_V_T vuint64m8_t
#define VIDV_MASK_UINT vid_v_u64m8_m
#define VIDV_UINT vid_v_u64m8
#define VADDVX_MASK_UINT vadd_vx_u64m8_m
#define VADDVX_UINT vadd_vx_u64m8
#define VMVVX_UINT vmv_v_x_u64m8
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
#define VSLIDEDOWN_UINT vslidedown_vx_u64m8
#define VMVVXS_UINT vmv_x_s_u64m8_u64
#else
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m8()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
#define MASK_T vbool4_t
#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFABSV_FLOAT vfabs_v_f32m8
#define VFMINVV_FLOAT vfmin_vv_f32m8
#define VFIRSTM vfirst_m_b4
#define UINT_V_T vuint32m8_t
#define VIDV_MASK_UINT vid_v_u32m8_m
#define VIDV_UINT vid_v_u32m8
#define VADDVX_MASK_UINT vadd_vx_u32m8_m
#define VADDVX_UINT vadd_vx_u32m8
#define VMVVX_UINT vmv_v_x_u32m8
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
#define VSLIDEDOWN_UINT vslidedown_vx_u32m8
#define VMVVXS_UINT vmv_x_s_u32m8_u32
#endif
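/* iamin: index of the element with the smallest absolute value.  Same
   lane-wise index tracking as iamax_rvv.c, but with the lane minima
   initialised to FLT_MAX, vfmin updates and a vfredmin final reduction. */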
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
unsigned int min_index = 0;
if (n <= 0 || inc_x <= 0) return(min_index);
FLOAT_V_T vx, v_min;
UINT_V_T v_min_index;
MASK_T mask;
size_t vlmax = VSETVL_MAX;
v_min_index = VMVVX_UINT(0, vlmax);
v_min = VFMVVF_FLOAT(FLT_MAX, vlmax);
BLASLONG j=0;
FLOAT minf=0.0;
if(inc_x == 1) {
for (size_t vl; n > 0; n -= vl, x += vl, j += vl) {
vl = VSETVL(n);
vx = VLEV_FLOAT(x, vl);
vx = VFABSV_FLOAT(vx, vl);
// index where element less than v_min
mask = VMFLTVV_FLOAT(vx, v_min, vl);
v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl);
v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl);
//update v_min and start_index j
v_min = VFMINVV_FLOAT(v_min, vx, vl);
}
} else {
BLASLONG stride_x = inc_x * sizeof(FLOAT);
for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) {
vl = VSETVL(n);
vx = VLSEV_FLOAT(x, stride_x, vl);
vx = VFABSV_FLOAT(vx, vl);
// index where element less than v_min
mask = VMFLTVV_FLOAT(vx, v_min, vl);
v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl);
v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl);
//update v_min and start_index j
v_min = VFMINVV_FLOAT(v_min, vx, vl);
}
}
FLOAT_V_T_M1 v_res, v_max;
v_res = VFMVVF_FLOAT_M1(0, vlmax);
v_max = VFMVVF_FLOAT_M1(FLT_MAX, vlmax);
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, vlmax);
minf = VFMVFS_FLOAT_M1(v_res);
mask = VMFLEVF_FLOAT(v_min, minf, vlmax);
min_index = VFIRSTM(mask, vlmax);
v_min_index = VSLIDEDOWN_UINT(v_min_index, v_min_index, min_index, vlmax);
min_index = VMVVXS_UINT(v_min_index);
return(min_index+1);
}

kernel/riscv64/imax_rvv.c Normal file
@@ -0,0 +1,147 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <float.h>
#if defined(DOUBLE)
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m8()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
#define MASK_T vbool8_t
#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFMAXVV_FLOAT vfmax_vv_f64m8
#define VFIRSTM vfirst_m_b8
#define UINT_V_T vuint64m8_t
#define VIDV_MASK_UINT vid_v_u64m8_m
#define VIDV_UINT vid_v_u64m8
#define VADDVX_MASK_UINT vadd_vx_u64m8_m
#define VADDVX_UINT vadd_vx_u64m8
#define VMVVX_UINT vmv_v_x_u64m8
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
#define VSLIDEDOWN_UINT vslidedown_vx_u64m8
#define VMVVXS_UINT vmv_x_s_u64m8_u64
#else
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m8()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
#define MASK_T vbool4_t
#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFMAXVV_FLOAT vfmax_vv_f32m8
#define VFIRSTM vfirst_m_b4
#define UINT_V_T vuint32m8_t
#define VIDV_MASK_UINT vid_v_u32m8_m
#define VIDV_UINT vid_v_u32m8
#define VADDVX_MASK_UINT vadd_vx_u32m8_m
#define VADDVX_UINT vadd_vx_u32m8
#define VMVVX_UINT vmv_v_x_u32m8
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
#define VSLIDEDOWN_UINT vslidedown_vx_u32m8
#define VMVVXS_UINT vmv_x_s_u32m8_u32
#endif
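/* imax: index of the largest element (no absolute value).  Identical
   bookkeeping to iamax_rvv.c, with the lane maxima initialised to -FLT_MAX
   so negative inputs are handled. */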
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
unsigned int max_index = 0;
if (n <= 0 || inc_x <= 0) return(max_index);
FLOAT_V_T vx, v_max;
UINT_V_T v_max_index;
MASK_T mask;
size_t vlmax = VSETVL_MAX;
v_max_index = VMVVX_UINT(0, vlmax);
v_max = VFMVVF_FLOAT(-FLT_MAX, vlmax);
BLASLONG j=0;
FLOAT maxf=0.0;
if(inc_x == 1) {
for (size_t vl; n > 0; n -= vl, x += vl, j += vl) {
vl = VSETVL(n);
vx = VLEV_FLOAT(x, vl);
//index where element greater than v_max
mask = VMFLTVV_FLOAT(v_max, vx, vl);
v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl);
v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl);
//update v_max and start_index j
v_max = VFMAXVV_FLOAT(v_max, vx, vl);
}
} else {
BLASLONG stride_x = inc_x * sizeof(FLOAT);
for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) {
vl = VSETVL(n);
vx = VLSEV_FLOAT(x, stride_x, vl);
//index where element greater than v_max
mask = VMFLTVV_FLOAT(v_max, vx, vl);
v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl);
v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl);
//update v_max and start_index j
v_max = VFMAXVV_FLOAT(v_max, vx, vl);
}
}
FLOAT_V_T_M1 v_res, v_min;
v_res = VFMVVF_FLOAT_M1(0, vlmax);
v_min = VFMVVF_FLOAT_M1(-FLT_MAX, vlmax);
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, vlmax);
maxf = VFMVFS_FLOAT_M1(v_res);
mask = VMFGEVF_FLOAT(v_max, maxf, vlmax);
max_index = VFIRSTM(mask, vlmax);
v_max_index = VSLIDEDOWN_UINT(v_max_index, v_max_index, max_index, vlmax);
max_index = VMVVXS_UINT(v_max_index);
return(max_index+1);
}

kernel/riscv64/imin_rvv.c Normal file
@@ -0,0 +1,147 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <float.h>
#if defined(DOUBLE)
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m8()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
#define MASK_T vbool8_t
#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFMINVV_FLOAT vfmin_vv_f64m8
#define VFIRSTM vfirst_m_b8
#define UINT_V_T vuint64m8_t
#define VIDV_MASK_UINT vid_v_u64m8_m
#define VIDV_UINT vid_v_u64m8
#define VADDVX_MASK_UINT vadd_vx_u64m8_m
#define VADDVX_UINT vadd_vx_u64m8
#define VMVVX_UINT vmv_v_x_u64m8
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
#define VSLIDEDOWN_UINT vslidedown_vx_u64m8
#define VMVVXS_UINT vmv_x_s_u64m8_u64
#else
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m8()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
#define MASK_T vbool4_t
#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFMINVV_FLOAT vfmin_vv_f32m8
#define VFIRSTM vfirst_m_b4
#define UINT_V_T vuint32m8_t
#define VIDV_MASK_UINT vid_v_u32m8_m
#define VIDV_UINT vid_v_u32m8
#define VADDVX_MASK_UINT vadd_vx_u32m8_m
#define VADDVX_UINT vadd_vx_u32m8
#define VMVVX_UINT vmv_v_x_u32m8
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
#define VSLIDEDOWN_UINT vslidedown_vx_u32m8
#define VMVVXS_UINT vmv_x_s_u32m8_u32
#endif
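/* imin: index of the smallest element, mirroring imax_rvv.c with vfmin
   updates and lane minima initialised to FLT_MAX. */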
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
unsigned int min_index = 0;
if (n <= 0 || inc_x <= 0) return(min_index);
FLOAT_V_T vx, v_min;
UINT_V_T v_min_index;
MASK_T mask;
size_t vlmax = VSETVL_MAX;
v_min_index = VMVVX_UINT(0, vlmax);
v_min = VFMVVF_FLOAT(FLT_MAX, vlmax);
BLASLONG j=0;
FLOAT minf=0.0;
if(inc_x == 1) {
for (size_t vl; n > 0; n -= vl, x += vl, j += vl) {
vl = VSETVL(n);
vx = VLEV_FLOAT(x, vl);
// index where element less than v_min
mask = VMFLTVV_FLOAT(vx, v_min, vl);
v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl);
v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl);
//update v_min and start_index j
v_min = VFMINVV_FLOAT(v_min, vx, vl);
}
} else {
BLASLONG stride_x = inc_x * sizeof(FLOAT);
for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) {
vl = VSETVL(n);
vx = VLSEV_FLOAT(x, stride_x, vl);
// index where element less than v_min
mask = VMFLTVV_FLOAT(vx, v_min, vl);
v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl);
v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl);
//update v_min and start_index j
v_min = VFMINVV_FLOAT(v_min, vx, vl);
}
}
FLOAT_V_T_M1 v_res, v_max;
v_res = VFMVVF_FLOAT_M1(0, vlmax);
v_max = VFMVVF_FLOAT_M1(FLT_MAX, vlmax);
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, vlmax);
minf = VFMVFS_FLOAT_M1(v_res);
mask = VMFLEVF_FLOAT(v_min, minf, vlmax);
min_index = VFIRSTM(mask, vlmax);
v_min_index = VSLIDEDOWN_UINT(v_min_index, v_min_index, min_index, vlmax);
min_index = VMVVXS_UINT(v_min_index);
return(min_index+1);
}

kernel/riscv64/izamax_rvv.c Normal file
@@ -0,0 +1,162 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(DOUBLE)
#define VSETVL(n) vsetvl_e64m4(n)
#define VSETVL_MAX vsetvlmax_e64m4()
#define FLOAT_V_T vfloat64m4_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLEV_FLOAT vle64_v_f64m4
#define VLSEV_FLOAT vlse64_v_f64m4
#define VLSEG_FLOAT vlseg2e64_v_f64m4
#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1
#define MASK_T vbool16_t
#define VMFLTVF_FLOAT vmflt_vf_f64m4_b16
#define VMFLTVV_FLOAT vmflt_vv_f64m4_b16
#define VMFGEVF_FLOAT vmfge_vf_f64m4_b16
#define VFMVVF_FLOAT vfmv_v_f_f64m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFABSV_FLOAT vfabs_v_f64m4
#define VFMAXVV_FLOAT vfmax_vv_f64m4
#define VFADDVV_FLOAT vfadd_vv_f64m4
#define VFIRSTM vfirst_m_b16
#define UINT_V_T vuint64m4_t
#define VIDV_MASK_UINT vid_v_u64m4_m
#define VIDV_UINT vid_v_u64m4
#define VADDVX_MASK_UINT vadd_vx_u64m4_m
#define VADDVX_UINT vadd_vx_u64m4
#define VMVVX_UINT vmv_v_x_u64m4
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
#define VSLIDEDOWN_UINT vslidedown_vx_u64m4
#define VMVVXS_UINT vmv_x_s_u64m4_u64
#else
#define VSETVL(n) vsetvl_e32m4(n)
#define VSETVL_MAX vsetvlmax_e32m4()
#define FLOAT_V_T vfloat32m4_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLEV_FLOAT vle32_v_f32m4
#define VLSEV_FLOAT vlse32_v_f32m4
#define VLSEG_FLOAT vlseg2e32_v_f32m4
#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1
#define MASK_T vbool8_t
#define VMFLTVF_FLOAT vmflt_vf_f32m4_b8
#define VMFLTVV_FLOAT vmflt_vv_f32m4_b8
#define VMFGEVF_FLOAT vmfge_vf_f32m4_b8
#define VFMVVF_FLOAT vfmv_v_f_f32m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFABSV_FLOAT vfabs_v_f32m4
#define VFMAXVV_FLOAT vfmax_vv_f32m4
#define VFADDVV_FLOAT vfadd_vv_f32m4
#define VFIRSTM vfirst_m_b8
#define UINT_V_T vuint32m4_t
#define VIDV_MASK_UINT vid_v_u32m4_m
#define VIDV_UINT vid_v_u32m4
#define VADDVX_MASK_UINT vadd_vx_u32m4_m
#define VADDVX_UINT vadd_vx_u32m4
#define VMVVX_UINT vmv_v_x_u32m4
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
#define VSLIDEDOWN_UINT vslidedown_vx_u32m4
#define VMVVXS_UINT vmv_x_s_u32m4_u32
#endif
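/* izamax: index of the complex element with the largest magnitude, where
   magnitude follows the BLAS CABS1 convention |re| + |im|.  Segment loads
   (VLSEG_FLOAT / VLSSEG_FLOAT) de-interleave the real and imaginary parts
   into vx0/vx1; the index tracking is the same as in iamax_rvv.c. */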
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
unsigned int max_index = 0;
if (n <= 0 || inc_x <= 0) return(max_index);
FLOAT_V_T vx0, vx1, v_max;
UINT_V_T v_max_index;
MASK_T mask;
size_t vlmax = VSETVL_MAX;
v_max_index = VMVVX_UINT(0, vlmax);
v_max = VFMVVF_FLOAT(-1, vlmax);
BLASLONG j=0;
FLOAT maxf=0.0;
if(inc_x == 1) {
for (size_t vl; n > 0; n -= vl, x += vl*2, j += vl) {
vl = VSETVL(n);
VLSEG_FLOAT(&vx0, &vx1, x, vl);
vx0 = VFABSV_FLOAT(vx0, vl);
vx1 = VFABSV_FLOAT(vx1, vl);
vx0 = VFADDVV_FLOAT(vx0, vx1, vl);
//index where element greater than v_max
mask = VMFLTVV_FLOAT(v_max, vx0, vl);
v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl);
v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl);
//update v_max and start_index j
v_max = VFMAXVV_FLOAT(v_max, vx0, vl);
}
}
else {
BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, j += vl) {
vl = VSETVL(n);
VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
vx0 = VFABSV_FLOAT(vx0, vl);
vx1 = VFABSV_FLOAT(vx1, vl);
vx0 = VFADDVV_FLOAT(vx0, vx1, vl);
//index where element greater than v_max
mask = VMFLTVV_FLOAT(v_max, vx0, vl);
v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl);
v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl);
//update v_max and start_index j
v_max = VFMAXVV_FLOAT(v_max, vx0, vl);
}
}
FLOAT_V_T_M1 v_res, v_z0;
v_res = VFMVVF_FLOAT_M1(0, vlmax);
v_z0 = VFMVVF_FLOAT_M1(0, vlmax);
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, vlmax);
maxf = VFMVFS_FLOAT_M1(v_res);
mask = VMFGEVF_FLOAT(v_max, maxf, vlmax);
max_index = VFIRSTM(mask, vlmax);
v_max_index = VSLIDEDOWN_UINT(v_max_index, v_max_index, max_index, vlmax);
max_index = VMVVXS_UINT(v_max_index);
return(max_index+1);
}

kernel/riscv64/izamin_rvv.c Normal file
@@ -0,0 +1,161 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <float.h>
#if defined(DOUBLE)
#define VSETVL(n) vsetvl_e64m4(n)
#define VSETVL_MAX vsetvlmax_e64m4()
#define FLOAT_V_T vfloat64m4_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLSEG_FLOAT vlseg2e64_v_f64m4
#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
#define VFREDMINVS_FLOAT vfredmin_vs_f64m4_f64m1
#define MASK_T vbool16_t
#define VMFLTVF_FLOAT vmflt_vf_f64m4_b16
#define VMFLTVV_FLOAT vmflt_vv_f64m4_b16
#define VMFLEVF_FLOAT vmfle_vf_f64m4_b16
#define VFMVVF_FLOAT vfmv_v_f_f64m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFABSV_FLOAT vfabs_v_f64m4
#define VFMINVV_FLOAT vfmin_vv_f64m4
#define VFADDVV_FLOAT vfadd_vv_f64m4
#define VFIRSTM vfirst_m_b16
#define UINT_V_T vuint64m4_t
#define VIDV_MASK_UINT vid_v_u64m4_m
#define VIDV_UINT vid_v_u64m4
#define VADDVX_MASK_UINT vadd_vx_u64m4_m
#define VADDVX_UINT vadd_vx_u64m4
#define VMVVX_UINT vmv_v_x_u64m4
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
#define VSLIDEDOWN_UINT vslidedown_vx_u64m4
#define VMVVXS_UINT vmv_x_s_u64m4_u64
#else
#define VSETVL(n) vsetvl_e32m4(n)
#define VSETVL_MAX vsetvlmax_e32m4()
#define FLOAT_V_T vfloat32m4_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLSEG_FLOAT vlseg2e32_v_f32m4
#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
#define VFREDMINVS_FLOAT vfredmin_vs_f32m4_f32m1
#define MASK_T vbool8_t
#define VMFLTVF_FLOAT vmflt_vf_f32m4_b8
#define VMFLTVV_FLOAT vmflt_vv_f32m4_b8
#define VMFLEVF_FLOAT vmfle_vf_f32m4_b8
#define VFMVVF_FLOAT vfmv_v_f_f32m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFABSV_FLOAT vfabs_v_f32m4
#define VFMINVV_FLOAT vfmin_vv_f32m4
#define VFADDVV_FLOAT vfadd_vv_f32m4
#define VFIRSTM vfirst_m_b8
#define UINT_V_T vuint32m4_t
#define VIDV_MASK_UINT vid_v_u32m4_m
#define VIDV_UINT vid_v_u32m4
#define VADDVX_MASK_UINT vadd_vx_u32m4_m
#define VADDVX_UINT vadd_vx_u32m4
#define VMVVX_UINT vmv_v_x_u32m4
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
#define VSLIDEDOWN_UINT vslidedown_vx_u32m4
#define VMVVXS_UINT vmv_x_s_u32m4_u32
#endif
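/* izamin: index of the complex element with the smallest |re| + |im|,
   mirroring izamax_rvv.c with vfmin updates, FLT_MAX initialisation and a
   vfredmin final reduction. */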
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
unsigned int min_index = 0;
if (n <= 0 || inc_x <= 0) return(min_index);
FLOAT_V_T vx0, vx1, v_min;
UINT_V_T v_min_index;
MASK_T mask;
size_t vlmax = VSETVL_MAX;
v_min_index = VMVVX_UINT(0, vlmax);
v_min = VFMVVF_FLOAT(FLT_MAX, vlmax);
BLASLONG j=0;
FLOAT minf=0.0;
if(inc_x == 1) {
for (size_t vl; n > 0; n -= vl, x += vl*2, j += vl) {
vl = VSETVL(n);
VLSEG_FLOAT(&vx0, &vx1, x, vl);
vx0 = VFABSV_FLOAT(vx0, vl);
vx1 = VFABSV_FLOAT(vx1, vl);
vx0 = VFADDVV_FLOAT(vx0, vx1, vl);
// index where element less than v_min
mask = VMFLTVV_FLOAT(vx0, v_min, vl);
v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl);
v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl);
//update v_min and start_index j
v_min = VFMINVV_FLOAT(v_min, vx0, vl);
}
} else {
BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, j += vl) {
vl = VSETVL(n);
VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
vx0 = VFABSV_FLOAT(vx0, vl);
vx1 = VFABSV_FLOAT(vx1, vl);
vx0 = VFADDVV_FLOAT(vx0, vx1, vl);
// index where element less than v_min
mask = VMFLTVV_FLOAT(vx0, v_min, vl);
v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl);
v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl);
//update v_min and start_index j
v_min = VFMINVV_FLOAT(v_min, vx0, vl);
}
}
FLOAT_V_T_M1 v_res, v_max;
v_res = VFMVVF_FLOAT_M1(0, vlmax);
v_max = VFMVVF_FLOAT_M1(FLT_MAX, vlmax);
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, vlmax);
minf = VFMVFS_FLOAT_M1(v_res);
mask = VMFLEVF_FLOAT(v_min, minf, vlmax);
min_index = VFIRSTM(mask, vlmax);
v_min_index = VSLIDEDOWN_UINT(v_min_index, v_min_index, min_index, vlmax);
min_index = VMVVXS_UINT(v_min_index);
return(min_index+1);
}

kernel/riscv64/max_rvv.c Normal file
@@ -0,0 +1,98 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <float.h>
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m8()
#define VSETVL_MAX_M1 vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFMAXVV_FLOAT vfmax_vv_f32m8
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m8()
#define VSETVL_MAX_M1 vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFMAXVV_FLOAT vfmax_vv_f64m8
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
#endif
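/* max: strip-mined maximum of a vector.  Lane maxima start at -FLT_MAX,
   each strip is folded in with vfmax, and a single vfredmax reduction at
   the end produces the scalar result. */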
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
FLOAT maxf = 0.0;
if (n <= 0 || inc_x <= 0) return(maxf);
FLOAT_V_T vx, vmax;
FLOAT_V_T_M1 v_res;
v_res = VFMVVF_FLOAT_M1(-FLT_MAX, VSETVL_MAX_M1);
size_t vlmax = VSETVL_MAX;
vmax = VFMVVF_FLOAT(-FLT_MAX, vlmax);
if(inc_x == 1) {
for (size_t vl; n > 0; n -= vl, x += vl) {
vl = VSETVL(n);
vx = VLEV_FLOAT(x, vl);
vmax = VFMAXVV_FLOAT(vmax, vx, vl);
}
} else {
BLASLONG stride_x = inc_x * sizeof(FLOAT);
for (size_t vl; n > 0; n -= vl, x += vl*inc_x) {
vl = VSETVL(n);
vx = VLSEV_FLOAT(x, stride_x, vl);
vmax = VFMAXVV_FLOAT(vmax, vx, vl);
}
}
v_res = VFREDMAXVS_FLOAT(v_res, vmax, v_res, vlmax);
maxf = VFMVFS_FLOAT_M1(v_res);
return(maxf);
}

kernel/riscv64/min_rvv.c Normal file
@@ -0,0 +1,98 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <float.h>
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m8()
#define VSETVL_MAX_M1 vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFMINVV_FLOAT vfmin_vv_f32m8
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m8()
#define VSETVL_MAX_M1 vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFMINVV_FLOAT vfmin_vv_f64m8
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
#endif
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
FLOAT minf = 0.0;
if (n <= 0 || inc_x <= 0) return(minf);
FLOAT_V_T vx, vmin;
FLOAT_V_T_M1 v_res;
v_res = VFMVVF_FLOAT_M1(FLT_MAX, VSETVL_MAX_M1);
size_t vlmax = VSETVL_MAX;
vmin = VFMVVF_FLOAT(FLT_MAX, vlmax);
if(inc_x == 1) {
for (size_t vl; n > 0; n -= vl, x += vl) {
vl = VSETVL(n);
vx = VLEV_FLOAT(x, vl);
vmin = VFMINVV_FLOAT(vmin, vx, vl);
}
} else {
BLASLONG stride_x = inc_x * sizeof(FLOAT);
for (size_t vl; n > 0; n -= vl, x += vl*inc_x) {
vl = VSETVL(n);
vx = VLSEV_FLOAT(x, stride_x, vl);
vmin = VFMINVV_FLOAT(vmin, vx, vl);
}
}
v_res = VFREDMINVS_FLOAT(v_res, vmin, v_res, vlmax);
minf = VFMVFS_FLOAT_M1(v_res);
return(minf);
}

kernel/riscv64/nrm2_rvv.c

@ -0,0 +1,103 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m8()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1
#define VFMACCVV_FLOAT vfmacc_vv_f32m8
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
#define ABS fabsf
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m8()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1
#define VFMACCVV_FLOAT vfmacc_vv_f64m8
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
#define ABS fabs
#endif
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
if( n <= 0 ) return(0.0);
if(n == 1) return (ABS(x[0]));
FLOAT_V_T vr, v0;
FLOAT_V_T_M1 v_res;
FLOAT ssq = 0.0;
size_t vlmax = VSETVL_MAX;
v_res = VFMVVF_FLOAT_M1(0, vlmax);
vr = VFMVVF_FLOAT(0, vlmax);
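// Accumulate x[i]*x[i] lane-wise with fused multiply-add; the single
// vfredusum reduction after the loop yields the total sum of squares.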
if(inc_x == 1) {
for (size_t vl; n > 0; n -= vl, x += vl) {
vl = VSETVL(n);
v0 = VLEV_FLOAT(x, vl);
vr = VFMACCVV_FLOAT(vr, v0, v0, vl);
}
} else {
BLASLONG stride_x = inc_x * sizeof(FLOAT);
for (size_t vl; n > 0; n -= vl, x += vl * inc_x) {
vl = VSETVL(n);
v0 = VLSEV_FLOAT(x, stride_x, vl);
vr = VFMACCVV_FLOAT(vr, v0, v0, vl);
}
}
v_res = VFREDSUM_FLOAT(v_res, vr, v_res, vlmax);
ssq = VFMVFS_FLOAT_M1(v_res);
return sqrt(ssq);
}

kernel/riscv64/rot_rvv.c

@ -0,0 +1,149 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define FLOAT_V_T vfloat32m8_t
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VSEV_FLOAT vse32_v_f32m8
#define VSSEV_FLOAT vsse32_v_f32m8
#define VFMACCVF_FLOAT vfmacc_vf_f32m8
#define VFMULVF_FLOAT vfmul_vf_f32m8
#define VFMSACVF_FLOAT vfmsac_vf_f32m8
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define FLOAT_V_T vfloat64m8_t
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VSEV_FLOAT vse64_v_f64m8
#define VSSEV_FLOAT vsse64_v_f64m8
#define VFMACCVF_FLOAT vfmacc_vf_f64m8
#define VFMULVF_FLOAT vfmul_vf_f64m8
#define VFMSACVF_FLOAT vfmsac_vf_f64m8
#endif
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
if(n <= 0) return(0);
FLOAT_V_T v0, v1, vx, vy;
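// Zero increments fall back to the scalar loop below; the vector paths apply
// the plane rotation x' = c*x + s*y, y' = c*y - s*x, using unit-stride or
// strided loads/stores depending on inc_x and inc_y.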
if (inc_x == 0 || inc_y == 0) {
BLASLONG i=0;
BLASLONG ix=0,iy=0;
FLOAT temp;
while(i < n)
{
temp = c*x[ix] + s*y[iy] ;
y[iy] = c*y[iy] - s*x[ix] ;
x[ix] = temp ;
ix += inc_x ;
iy += inc_y ;
i++ ;
}
}
else if(inc_x == 1 && inc_y == 1) {
for (size_t vl; n > 0; n -= vl, x += vl, y += vl) {
vl = VSETVL(n);
vx = VLEV_FLOAT(x, vl);
vy = VLEV_FLOAT(y, vl);
v0 = VFMULVF_FLOAT(vx, c, vl);
v0 = VFMACCVF_FLOAT(v0, s, vy, vl);
VSEV_FLOAT(x, v0, vl);
v1 = VFMULVF_FLOAT(vx, s, vl);
v1 = VFMSACVF_FLOAT(v1, c, vy, vl);
VSEV_FLOAT(y, v1, vl);
}
} else if(inc_y == 1) {
BLASLONG stride_x = inc_x * sizeof(FLOAT);
for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) {
vl = VSETVL(n);
vx = VLSEV_FLOAT(x, stride_x, vl);
vy = VLEV_FLOAT(y, vl);
v0 = VFMULVF_FLOAT(vx, c, vl);
v0 = VFMACCVF_FLOAT(v0, s, vy, vl);
VSSEV_FLOAT(x, stride_x, v0, vl);
v1 = VFMULVF_FLOAT(vx, s, vl);
v1 = VFMSACVF_FLOAT(v1, c, vy, vl);
VSEV_FLOAT(y, v1, vl);
}
} else if(inc_x == 1) {
BLASLONG stride_y = inc_y * sizeof(FLOAT);
for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) {
vl = VSETVL(n);
vx = VLEV_FLOAT(x, vl);
vy = VLSEV_FLOAT(y, stride_y, vl);
v0 = VFMULVF_FLOAT(vx, c, vl);
v0 = VFMACCVF_FLOAT(v0, s, vy, vl);
VSEV_FLOAT(x, v0, vl);
v1 = VFMULVF_FLOAT(vx, s, vl);
v1 = VFMSACVF_FLOAT(v1, c, vy, vl);
VSSEV_FLOAT(y, stride_y, v1, vl);
}
} else {
BLASLONG stride_x = inc_x * sizeof(FLOAT);
BLASLONG stride_y = inc_y * sizeof(FLOAT);
for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) {
vl = VSETVL(n);
vx = VLSEV_FLOAT(x, stride_x, vl);
vy = VLSEV_FLOAT(y, stride_y, vl);
v0 = VFMULVF_FLOAT(vx, c, vl);
v0 = VFMACCVF_FLOAT(v0, s, vy, vl);
VSSEV_FLOAT(x, stride_x, v0, vl);
v1 = VFMULVF_FLOAT(vx, s, vl);
v1 = VFMSACVF_FLOAT(v1, c, vy, vl);
VSSEV_FLOAT(y, stride_y, v1, vl);
}
}
return(0);
}

kernel/riscv64/scal_rvv.c

@ -0,0 +1,80 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define FLOAT_V_T vfloat32m8_t
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VSEV_FLOAT vse32_v_f32m8
#define VSSEV_FLOAT vsse32_v_f32m8
#define VFMULVF_FLOAT vfmul_vf_f32m8
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define FLOAT_V_T vfloat64m8_t
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VSEV_FLOAT vse64_v_f64m8
#define VSSEV_FLOAT vsse64_v_f64m8
#define VFMULVF_FLOAT vfmul_vf_f64m8
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#endif
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
if ( (n <= 0) || (inc_x <= 0)) return(0);
FLOAT_V_T v0;
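// In-place scaling x := da * x, with a contiguous path and a strided path.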
if(inc_x == 1) {
for (size_t vl; n > 0; n -= vl, x += vl) {
vl = VSETVL(n);
v0 = VLEV_FLOAT(x, vl);
v0 = VFMULVF_FLOAT(v0, da, vl);
VSEV_FLOAT(x, v0, vl);
}
} else {
BLASLONG stride_x = inc_x * sizeof(FLOAT);
for (size_t vl; n > 0; n -= vl, x += vl*inc_x) {
vl = VSETVL(n);
v0 = VLSEV_FLOAT(x, stride_x, vl);
v0 = VFMULVF_FLOAT(v0, da, vl);
VSSEV_FLOAT(x, stride_x, v0, vl);
}
}
return 0;
}

kernel/riscv64/sum_rvv.c

@ -0,0 +1,95 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m8()
#define VSETVL_MAX_M1 vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFADDVV_FLOAT vfadd_vv_f32m8
#define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m8()
#define VSETVL_MAX_M1 vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFADDVV_FLOAT vfadd_vv_f64m8
#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
#endif
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
FLOAT sumf = 0.0;
if (n <= 0 || inc_x <= 0) return(sumf);
FLOAT_V_T vx, vsum;
FLOAT_V_T_M1 v_res;
v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1);
size_t vlmax = VSETVL_MAX;
vsum = VFMVVF_FLOAT(0.0, vlmax);
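// Lane-wise partial sums; one vfredusum at the end produces the final value.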
if(inc_x == 1) {
for (size_t vl; n > 0; n -= vl, x += vl) {
vl = VSETVL(n);
vx = VLEV_FLOAT(x, vl);
vsum = VFADDVV_FLOAT(vsum, vx, vl);
}
} else {
BLASLONG stride_x = inc_x * sizeof(FLOAT);
for (size_t vl; n > 0; n -= vl, x += vl*inc_x) {
vl = VSETVL(n);
vx = VLSEV_FLOAT(x, stride_x, vl);
vsum = VFADDVV_FLOAT(vsum, vx, vl);
}
}
v_res = VFREDSUMVS_FLOAT(v_res, vsum, v_res, vlmax);
sumf = VFMVFS_FLOAT_M1(v_res);
return(sumf);
}

kernel/riscv64/swap_rvv.c

@ -0,0 +1,142 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m8()
#define FLOAT_V_T vfloat32m8_t
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VSEV_FLOAT vse32_v_f32m8
#define VSSEV_FLOAT vsse32_v_f32m8
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m8()
#define FLOAT_V_T vfloat64m8_t
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VSEV_FLOAT vse64_v_f64m8
#define VSSEV_FLOAT vsse64_v_f64m8
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#endif
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
BLASLONG stride_x, stride_y;
FLOAT_V_T vx, vy;
if (n <= 0) return(0);
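// Zero-increment cases mirror the reference BLAS loop: with both increments
// zero, an odd n reduces to a single swap of x[0] and y[0]; with only one
// increment zero, the strided vector is shifted by one logical position
// (reversed strided copies), its old last element lands in the fixed operand
// and the fixed operand's old value lands in the first element.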
if (inc_x == 0 && inc_y == 0) {
if (n & 1) {
FLOAT temp = x[0];
x[0] = y[0];
y[0] = temp;
}
else {
return 0;
}
}
else if(inc_x == 0) {
FLOAT temp = x[0];
x[0] = y[(n - 1) * inc_y];
FLOAT* ptr = y + (n - 1) * inc_y; // start from the last one
stride_y = (0 - inc_y) * sizeof(FLOAT); // reverse
BLASLONG m = n - 1;
for (size_t vl; m > 0; m -= vl, ptr -= vl*inc_y) {
vl = VSETVL(m);
vy = VLSEV_FLOAT(ptr - inc_y, stride_y, vl);
VSSEV_FLOAT(ptr, stride_y, vy, vl);
}
y[0] = temp;
}
else if(inc_y == 0) {
FLOAT temp = y[0];
y[0] = x[(n - 1) * inc_x];
FLOAT* ptr = x + (n - 1) * inc_x; // start from the last one
stride_x = (0 - inc_x) * sizeof(FLOAT); // reverse
BLASLONG m = n - 1;
for (size_t vl; m > 0; m -= vl, ptr -= vl*inc_x) {
vl = VSETVL(m);
vx = VLSEV_FLOAT(ptr - inc_x, stride_x, vl);
VSSEV_FLOAT(ptr, stride_x, vx, vl);
}
x[0] = temp;
}
else if(inc_x == 1 && inc_y == 1) {
for (size_t vl; n > 0; n -= vl, x += vl, y += vl) {
vl = VSETVL(n);
vx = VLEV_FLOAT(x, vl);
vy = VLEV_FLOAT(y, vl);
VSEV_FLOAT(y, vx, vl);
VSEV_FLOAT(x, vy, vl);
}
} else if (inc_y == 1) {
stride_x = inc_x * sizeof(FLOAT);
for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) {
vl = VSETVL(n);
vx = VLSEV_FLOAT(x, stride_x, vl);
vy = VLEV_FLOAT(y, vl);
VSEV_FLOAT(y, vx, vl);
VSSEV_FLOAT(x, stride_x, vy, vl);
}
} else if(inc_x == 1) {
stride_y = inc_y * sizeof(FLOAT);
for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) {
vl = VSETVL(n);
vx = VLEV_FLOAT(x, vl);
vy = VLSEV_FLOAT(y, stride_y, vl);
VSSEV_FLOAT(y, stride_y, vx, vl);
VSEV_FLOAT(x, vy, vl);
}
} else {
stride_x = inc_x * sizeof(FLOAT);
stride_y = inc_y * sizeof(FLOAT);
for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) {
vl = VSETVL(n);
vx = VLSEV_FLOAT(x, stride_x, vl);
vy = VLSEV_FLOAT(y, stride_y, vl);
VSSEV_FLOAT(y, stride_y, vx, vl);
VSSEV_FLOAT(x, stride_x, vy, vl);
}
}
return(0);
}


@ -0,0 +1,101 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m2(n)
#define VSETVL_MAX vsetvlmax_e32m2()
#define FLOAT_V_T vfloat32m2_t
#define VLEV_FLOAT vle32_v_f32m2
#define VSEV_FLOAT vse32_v_f32m2
#define VLSEV_FLOAT vlse32_v_f32m2
#define INT_V_T vint32m2_t
#define VID_V_INT vid_v_i32m2
#define VADD_VX_INT vadd_vx_i32m2
#define VMSGT_VX_INT vmsgt_vx_i32m2_b16
#define VBOOL_T vbool16_t
#define VMERGE_VVM_FLOAT vmerge_vvm_f32m2
#else
#define VSETVL(n) vsetvl_e64m2(n)
#define VSETVL_MAX vsetvlmax_e64m2()
#define FLOAT_V_T vfloat64m2_t
#define VLEV_FLOAT vle64_v_f64m2
#define VSEV_FLOAT vse64_v_f64m2
#define VLSEV_FLOAT vlse64_v_f64m2
#define INT_V_T vint64m2_t
#define VID_V_INT vid_v_i64m2
#define VADD_VX_INT vadd_vx_i64m2
#define VMSGT_VX_INT vmsgt_vx_i64m2_b32
#define VBOOL_T vbool32_t
#define VMERGE_VVM_FLOAT vmerge_vvm_f64m2
#endif
// Optimizes the implementation in ../generic/symm_lcopy_4.c
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b)
{
BLASLONG i, js, offset;
FLOAT *ao1, *ao2;
BLASLONG stride_lda = sizeof(FLOAT)*lda;
FLOAT_V_T vb, va1, va2;
VBOOL_T vbool;
INT_V_T vindex_max, vindex;
size_t vl = VSETVL_MAX;
vindex_max = VID_V_INT(vl);
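// vindex_max holds the lane indices 0..vl-1; adding the running offset and
// comparing against 0 builds the per-lane mask that selects between the
// contiguous and the strided read, so the packed panel follows the symmetric
// storage across the diagonal.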
for (js = n; js > 0; js -= vl, posX += vl) {
vl = VSETVL(js);
offset = posX - posY;
ao1 = a + posX + posY * lda;
ao2 = a + posY + (posX) * lda;
for (i = m; i > 0; i--, offset--) {
va2 = VLSEV_FLOAT(ao2, stride_lda, vl);
va1 = VLEV_FLOAT(ao1, vl);
// offset > (0 - vindex) ---> (offset + vindex) > 0
vindex = VADD_VX_INT(vindex_max, offset, vl);
vbool = VMSGT_VX_INT(vindex, 0, vl);
vb = VMERGE_VVM_FLOAT(vbool, va2, va1, vl);
VSEV_FLOAT(b, vb, vl);
b += vl;
ao1 += lda;
ao2++;
}
}
return 0;
}


@ -0,0 +1,100 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m2(n)
#define VSETVL_MAX vsetvlmax_e32m2()
#define FLOAT_V_T vfloat32m2_t
#define VLEV_FLOAT vle32_v_f32m2
#define VSEV_FLOAT vse32_v_f32m2
#define VLSEV_FLOAT vlse32_v_f32m2
#define INT_V_T vint32m2_t
#define VID_V_INT vid_v_i32m2
#define VADD_VX_INT vadd_vx_i32m2
#define VMSGT_VX_INT vmsgt_vx_i32m2_b16
#define VBOOL_T vbool16_t
#define VMERGE_VVM_FLOAT vmerge_vvm_f32m2
#else
#define VSETVL(n) vsetvl_e64m2(n)
#define VSETVL_MAX vsetvlmax_e64m2()
#define FLOAT_V_T vfloat64m2_t
#define VLEV_FLOAT vle64_v_f64m2
#define VSEV_FLOAT vse64_v_f64m2
#define VLSEV_FLOAT vlse64_v_f64m2
#define INT_V_T vint64m2_t
#define VID_V_INT vid_v_i64m2
#define VADD_VX_INT vadd_vx_i64m2
#define VMSGT_VX_INT vmsgt_vx_i64m2_b32
#define VBOOL_T vbool32_t
#define VMERGE_VVM_FLOAT vmerge_vvm_f64m2
#endif
// Optimizes the implementation in ../generic/symm_ucopy_4.c
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b)
{
BLASLONG i, js, offset;
FLOAT *ao1, *ao2;
BLASLONG stride_lda = sizeof(FLOAT)*lda;
FLOAT_V_T vb, va1, va2;
VBOOL_T vbool;
INT_V_T vindex_max, vindex;
size_t vl = VSETVL_MAX;
vindex_max = VID_V_INT(vl);
for (js = n; js > 0; js -= vl, posX += vl) {
vl = VSETVL(js);
offset = posX - posY;
ao1 = a + posY + (posX + 0) * lda;
ao2 = a + posX + 0 + posY * lda;
for (i = m; i > 0; i--, offset--) {
va1 = VLSEV_FLOAT(ao1, stride_lda, vl);
va2 = VLEV_FLOAT(ao2, vl);
// offset > (0 - vindex) ---> (offset + vindex) > 0
vindex = VADD_VX_INT(vindex_max, offset, vl);
vbool = VMSGT_VX_INT(vindex, 0, vl);
vb = VMERGE_VVM_FLOAT(vbool, va2, va1, vl);
VSEV_FLOAT(b, vb, vl);
b += vl;
ao1++;
ao2 += lda;
}
}
return 0;
}

kernel/riscv64/symv_L_rvv.c

@ -0,0 +1,224 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL_MAX_M1 vsetvlmax_e32m1()
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m8()
#define FLOAT_V_T_M1 vfloat32m1_t
#define FLOAT_V_T vfloat32m8_t
#define VLEV_FLOAT vle32_v_f32m8
#define VSEV_FLOAT vse32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VSSEV_FLOAT vsse32_v_f32m8
#define VFMACCVV_FLOAT vfmacc_vv_f32m8
#define VFMACCVF_FLOAT vfmacc_vf_f32m8
#define VFNMSACVF_FLOAT vfnmsac_vf_f32m8
#define VFMULVF_FLOAT vfmul_vf_f32m8
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMSACVF_FLOAT vfmsac_vf_f32m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
#else
#define VSETVL_MAX_M1 vsetvlmax_e64m1()
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m8()
#define FLOAT_V_T_M1 vfloat64m1_t
#define FLOAT_V_T vfloat64m8_t
#define VLEV_FLOAT vle64_v_f64m8
#define VSEV_FLOAT vse64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VSSEV_FLOAT vsse64_v_f64m8
#define VFMACCVV_FLOAT vfmacc_vv_f64m8
#define VFMACCVF_FLOAT vfmacc_vf_f64m8
#define VFNMSACVF_FLOAT vfnmsac_vf_f64m8
#define VFMULVF_FLOAT vfmul_vf_f64m8
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMSACVF_FLOAT vfmsac_vf_f64m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
#endif
int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG i, j, k;
BLASLONG ix,iy;
BLASLONG jx,jy;
FLOAT temp1;
FLOAT *a_ptr = a;
FLOAT_V_T_M1 v_res, v_z0;
size_t vlmax = VSETVL_MAX_M1, vl;
v_res = VFMVVF_FLOAT_M1(0, vlmax);
v_z0 = VFMVVF_FLOAT_M1(0, vlmax);
vlmax = VSETVL_MAX;
FLOAT_V_T va, vx, vy, vr;
BLASLONG stride_x, stride_y, inc_xv, inc_yv;
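// Lower-triangular SYMV: for each column j, y[j] takes the diagonal term, the
// sub-diagonal part of the column updates y[i] += temp1 * A[i][j], and the
// same column is dotted with x so its contribution can be folded back into y[j].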
if(inc_x == 1 && inc_y == 1)
{
for (j=0; j<offset; j++)
{
temp1 = alpha * x[j];
y[j] += temp1 * a_ptr[j];
i = j + 1;
vr = VFMVVF_FLOAT(0, vlmax);
for (k = (m-i); k > 0; k -= vl, i += vl)
{
vl = VSETVL(k);
vr = VFMVVF_FLOAT(0, vl);
va = VLEV_FLOAT(&a_ptr[i], vl);
vy = VLEV_FLOAT(&y[i], vl);
vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
VSEV_FLOAT(&y[i], vy, vl);
vx = VLEV_FLOAT(&x[i], vl);
vr = VFMACCVV_FLOAT(vr, vx, va, vl);
}
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax);
y[j] += alpha * VFMVFS_FLOAT_M1(v_res);
a_ptr += lda;
}
}
else if(inc_x == 1)
{
jy = 0;
stride_y = inc_y * sizeof(FLOAT);
for (j=0; j<offset; j++)
{
temp1 = alpha * x[j];
y[jy] += temp1 * a_ptr[j];
iy = jy + inc_y;
i = j + 1;
vr = VFMVVF_FLOAT(0, vlmax);
for (k = (m-i); k > 0; k -= vl, i += vl)
{
vl = VSETVL(k);
inc_yv = inc_y * vl;
vr = VFMVVF_FLOAT(0, vl);
va = VLEV_FLOAT(&a_ptr[i], vl);
vy = VLSEV_FLOAT(&y[iy], stride_y, vl);
vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
VSSEV_FLOAT(&y[iy], stride_y, vy, vl);
vx = VLEV_FLOAT(&x[i], vl);
vr = VFMACCVV_FLOAT(vr, vx, va, vl);
iy += inc_yv;
}
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax);
y[jy] += alpha * VFMVFS_FLOAT_M1(v_res);
jy += inc_y;
a_ptr += lda;
}
}
else if(inc_y == 1)
{
jx = 0;
stride_x = inc_x * sizeof(FLOAT);
for (j=0; j<offset; j++)
{
temp1 = alpha * x[jx];
y[j] += temp1 * a_ptr[j];
ix = jx + inc_x;
i = j + 1;
vr = VFMVVF_FLOAT(0, vlmax);
for (k = (m-i); k > 0; k -= vl, i += vl)
{
vl = VSETVL(k);
vr = VFMVVF_FLOAT(0, vl);
inc_xv = inc_x * vl;
va = VLEV_FLOAT(&a_ptr[i], vl);
vy = VLEV_FLOAT(&y[i], vl);
vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
VSEV_FLOAT(&y[i], vy, vl);
vx = VLSEV_FLOAT(&x[ix], stride_x, vl);
vr = VFMACCVV_FLOAT(vr, vx, va, vl);
ix += inc_xv;
}
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax);
y[j] += alpha * VFMVFS_FLOAT_M1(v_res);
jx += inc_x;
a_ptr += lda;
}
}
else
{
stride_x = inc_x * sizeof(FLOAT);
stride_y = inc_y * sizeof(FLOAT);
jx = 0;
jy = 0;
for (j=0; j<offset; j++)
{
temp1 = alpha * x[jx];
y[jy] += temp1 * a_ptr[j];
ix = jx + inc_x;
iy = jy + inc_y;
i = j + 1;
vr = VFMVVF_FLOAT(0, vlmax);
for (k = (m-i); k > 0; k -= vl, i += vl)
{
vl = VSETVL(k);
inc_xv = inc_x * vl;
inc_yv = inc_y * vl;
vr = VFMVVF_FLOAT(0, vl);
va = VLEV_FLOAT(&a_ptr[i], vl);
vy = VLSEV_FLOAT(&y[iy], stride_y, vl);
vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
VSSEV_FLOAT(&y[iy], stride_y, vy, vl);
vx = VLSEV_FLOAT(&x[ix], stride_x, vl);
vr = VFMACCVV_FLOAT(vr, vx, va, vl);
ix += inc_xv;
iy += inc_yv;
}
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax);
y[jy] += alpha * VFMVFS_FLOAT_M1(v_res);
jx += inc_x;
jy += inc_y;
a_ptr += lda;
}
}
return(0);
}

kernel/riscv64/symv_U_rvv.c

@ -0,0 +1,221 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL_MAX_M1 vsetvlmax_e32m1()
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m8()
#define FLOAT_V_T_M1 vfloat32m1_t
#define FLOAT_V_T vfloat32m8_t
#define VLEV_FLOAT vle32_v_f32m8
#define VSEV_FLOAT vse32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VSSEV_FLOAT vsse32_v_f32m8
#define VFMACCVV_FLOAT vfmacc_vv_f32m8
#define VFMACCVF_FLOAT vfmacc_vf_f32m8
#define VFNMSACVF_FLOAT vfnmsac_vf_f32m8
#define VFMULVF_FLOAT vfmul_vf_f32m8
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMSACVF_FLOAT vfmsac_vf_f32m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
#else
#define VSETVL_MAX_M1 vsetvlmax_e64m1()
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m8()
#define FLOAT_V_T_M1 vfloat64m1_t
#define FLOAT_V_T vfloat64m8_t
#define VLEV_FLOAT vle64_v_f64m8
#define VSEV_FLOAT vse64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VSSEV_FLOAT vsse64_v_f64m8
#define VFMACCVV_FLOAT vfmacc_vv_f64m8
#define VFMACCVF_FLOAT vfmacc_vf_f64m8
#define VFNMSACVF_FLOAT vfnmsac_vf_f64m8
#define VFMULVF_FLOAT vfmul_vf_f64m8
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMSACVF_FLOAT vfmsac_vf_f64m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
#endif
int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG i, j, k;
BLASLONG ix,iy;
BLASLONG jx,jy;
FLOAT temp1;
FLOAT *a_ptr = a;
FLOAT_V_T_M1 v_res, v_z0;
size_t vl_max = VSETVL_MAX_M1, vl;
v_res = VFMVVF_FLOAT_M1(0, vl_max);
v_z0 = VFMVVF_FLOAT_M1(0, vl_max);
vl_max = VSETVL_MAX;
FLOAT_V_T va, vx, vy, vr;
BLASLONG stride_x, stride_y, inc_xv, inc_yv;
BLASLONG m1 = m - offset;
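// Upper-triangular SYMV over the trailing 'offset' columns: m1 marks where
// that block starts; each column j combines rows 0..j-1 with the diagonal term.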
if(inc_x == 1 && inc_y == 1)
{
a_ptr += m1 * lda;
for (j=m1; j<m; j++)
{
temp1 = alpha * x[j];
i = 0;
vr = VFMVVF_FLOAT(0, vl_max);
for (k = j; k > 0; k -= vl, i += vl)
{
vl = VSETVL(k);
vr = VFMVVF_FLOAT(0, vl);
vy = VLEV_FLOAT(&y[i], vl);
va = VLEV_FLOAT(&a_ptr[i], vl);
vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
VSEV_FLOAT(&y[i], vy, vl);
vx = VLEV_FLOAT(&x[i], vl);
vr = VFMACCVV_FLOAT(vr, vx, va, vl);
}
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max);
y[j] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res);
a_ptr += lda;
}
}
else if(inc_x == 1)
{
jy = m1 * inc_y;
a_ptr += m1 * lda;
stride_y = inc_y * sizeof(FLOAT);
for (j=m1; j<m; j++)
{
temp1 = alpha * x[j];
iy = 0;
i = 0;
vr = VFMVVF_FLOAT(0, vl_max);
for (k = j; k > 0; k -= vl, i += vl)
{
vl = VSETVL(k);
inc_yv = inc_y * vl;
vr = VFMVVF_FLOAT(0, vl);
vy = VLSEV_FLOAT(&y[iy], stride_y, vl);
va = VLEV_FLOAT(&a_ptr[i], vl);
vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
VSSEV_FLOAT(&y[iy], stride_y, vy, vl);
vx = VLEV_FLOAT(&x[i], vl);
vr = VFMACCVV_FLOAT(vr, vx, va, vl);
iy += inc_yv;
}
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max);
y[jy] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res);
a_ptr += lda;
jy += inc_y;
}
}
else if(inc_y == 1)
{
jx = m1 * inc_x;
a_ptr += m1 * lda;
stride_x = inc_x * sizeof(FLOAT);
for (j=m1; j<m; j++)
{
temp1 = alpha * x[jx];
ix = 0;
i = 0;
vr = VFMVVF_FLOAT(0, vl_max);
for (k = j; k > 0; k -= vl, i += vl)
{
vl = VSETVL(k);
inc_xv = inc_x * vl;
vr = VFMVVF_FLOAT(0, vl);
vy = VLEV_FLOAT(&y[i], vl);
va = VLEV_FLOAT(&a_ptr[i], vl);
vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
VSEV_FLOAT(&y[i], vy, vl);
vx = VLSEV_FLOAT(&x[ix], stride_x, vl);
vr = VFMACCVV_FLOAT(vr, vx, va, vl);
ix += inc_xv;
}
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max);
y[j] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res);
a_ptr += lda;
jx += inc_x;
}
}
else
{
jx = m1 * inc_x;
jy = m1 * inc_y;
a_ptr += m1 * lda;
stride_x = inc_x * sizeof(FLOAT);
stride_y = inc_y * sizeof(FLOAT);
for (j=m1; j<m; j++)
{
temp1 = alpha * x[jx];
ix = 0;
iy = 0;
i = 0;
vr = VFMVVF_FLOAT(0, vl_max);
for (k = j; k > 0; k -= vl, i += vl)
{
vl = VSETVL(k);
inc_xv = inc_x * vl;
inc_yv = inc_y * vl;
vr = VFMVVF_FLOAT(0, vl);
vy = VLSEV_FLOAT(&y[iy], stride_y, vl);
va = VLEV_FLOAT(&a_ptr[i], vl);
vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
VSSEV_FLOAT(&y[iy], stride_y, vy, vl);
vx = VLSEV_FLOAT(&x[ix], stride_x, vl);
vr = VFMACCVV_FLOAT(vr, vx, va, vl);
ix += inc_xv;
iy += inc_yv;
}
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max);
y[jy] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res);
a_ptr += lda;
jx += inc_x;
jy += inc_y;
}
}
return(0);
}


@ -0,0 +1,138 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m2(n)
#define FLOAT_V_T vfloat32m2_t
#define VLEV_FLOAT vle32_v_f32m2
#define VSEV_FLOAT vse32_v_f32m2
#define VLSEV_FLOAT vlse32_v_f32m2
#define VBOOL_T vbool16_t
#define UINT_V_T vint32m2_t
#define VID_V_UINT vid_v_i32m2
#define VMSGTU_VX_UINT vmsgt_vx_i32m2_b16
#define VMSEQ_VX_UINT vmseq_vx_i32m2_b16
#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2
#else
#define VSETVL(n) vsetvl_e64m2(n)
#define FLOAT_V_T vfloat64m2_t
#define VLEV_FLOAT vle64_v_f64m2
#define VSEV_FLOAT vse64_v_f64m2
#define VLSEV_FLOAT vlse64_v_f64m2
#define VBOOL_T vbool32_t
#define UINT_V_T vuint64m2_t
#define VID_V_UINT vid_v_u64m2
#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32
#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32
#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2
#endif
// Optimizes the implementation in ../arm64/trmm_lncopy_sve_v1.c
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
BLASLONG i, js, X;
FLOAT *ao;
BLASLONG stride_lda = sizeof(FLOAT)*lda;
FLOAT_V_T vb, va1;
size_t vl;
#ifdef UNIT
VBOOL_T vbool_eq;
#endif
VBOOL_T vbool_cmp;
UINT_V_T vindex;
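// Three cases per step: one off-diagonal case copies the strided row as-is,
// the other only advances the pointers, and the block containing the diagonal
// is built lane by lane, zeroing the excluded half and writing ONE on the
// diagonal when UNIT is defined.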
for (js = n; js > 0; js -= vl)
{
vl = VSETVL(js);
X = posX;
if (posX <= posY)
{
ao = a + posY + posX * lda;
}
else
{
ao = a + posX + posY * lda;
}
i = 0;
do
{
if (X > posY)
{
va1 = VLSEV_FLOAT(ao, stride_lda, vl);
VSEV_FLOAT(b, va1, vl);
ao ++;
b += vl;
X ++;
i ++;
}
else if (X < posY)
{
ao += lda;
b += vl;
X ++;
i ++;
}
else
{
vindex = VID_V_UINT(vl);
for (unsigned int j = 0; j < vl; j++)
{
va1 = VLSEV_FLOAT(ao, stride_lda, vl);
vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl);
vb = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl);
#ifdef UNIT
vbool_eq = VMSEQ_VX_UINT(vindex, j, vl);
vb = VFMERGE_VFM_FLOAT(vbool_eq, vb, ONE, vl);
#endif
VSEV_FLOAT(b, vb, vl);
ao++;
b += vl;
}
X += vl;
i += vl;
}
} while (i < m);
posY += vl;
}
return 0;
}


@ -0,0 +1,134 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m2(n)
#define FLOAT_V_T vfloat32m2_t
#define VLEV_FLOAT vle32_v_f32m2
#define VSEV_FLOAT vse32_v_f32m2
#define VBOOL_T vbool16_t
#define UINT_V_T vuint32m2_t
#define VID_V_UINT vid_v_u32m2
#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16
#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16
#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2
#else
#define VSETVL(n) vsetvl_e64m2(n)
#define FLOAT_V_T vfloat64m2_t
#define VLEV_FLOAT vle64_v_f64m2
#define VSEV_FLOAT vse64_v_f64m2
#define VBOOL_T vbool32_t
#define UINT_V_T vuint64m2_t
#define VID_V_UINT vid_v_u64m2
#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32
#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32
#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2
#endif
// Optimizes the implementation in ../arm64/trmm_ltcopy_sve_v1.c
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
BLASLONG i, js, X;
FLOAT *ao;
FLOAT_V_T vb, va1;
size_t vl;
#ifdef UNIT
VBOOL_T vbool_eq;
#endif
VBOOL_T vbool_cmp;
UINT_V_T vindex;
for (js = n; js > 0; js -= vl)
{
vl = VSETVL(js);
X = posX;
if (posX <= posY)
{
ao = a + posY + posX * lda;
}
else
{
ao = a + posX + posY * lda;
}
i = 0;
do
{
if (X > posY)
{
ao ++;
b += vl;
X ++;
i ++;
}
else if (X < posY)
{
va1 = VLEV_FLOAT(ao, vl);
VSEV_FLOAT(b, va1, vl);
ao += lda;
b += vl;
X ++;
i ++;
}
else
{
vindex = VID_V_UINT(vl);
for (unsigned int j = 0; j < vl; j++)
{
va1 = VLEV_FLOAT(ao, vl);
vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl);
vb = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl);
#ifdef UNIT
vbool_eq = VMSEQ_VX_UINT(vindex, j, vl);
vb = VFMERGE_VFM_FLOAT(vbool_eq, vb, ONE, vl);
#endif
VSEV_FLOAT(b, vb, vl);
ao += lda;
b += vl;
}
X += vl;
i += vl;
}
} while (i < m);
posY += vl;
}
return 0;
}


@ -0,0 +1,136 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m2(n)
#define FLOAT_V_T vfloat32m2_t
#define VLEV_FLOAT vle32_v_f32m2
#define VLSEV_FLOAT vlse32_v_f32m2
#define VSEV_FLOAT vse32_v_f32m2
#define VBOOL_T vbool16_t
#define UINT_V_T vuint32m2_t
#define VID_V_UINT vid_v_u32m2
#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16
#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16
#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2
#else
#define VSETVL(n) vsetvl_e64m2(n)
#define FLOAT_V_T vfloat64m2_t
#define VLEV_FLOAT vle64_v_f64m2
#define VLSEV_FLOAT vlse64_v_f64m2
#define VSEV_FLOAT vse64_v_f64m2
#define VBOOL_T vbool32_t
#define UINT_V_T vuint64m2_t
#define VID_V_UINT vid_v_u64m2
#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32
#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32
#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2
#endif
// Optimizes the implementation in ../arm64/trmm_uncopy_sve_v1.c
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
BLASLONG i, js, X;
BLASLONG stride_lda = sizeof(FLOAT) * lda;
FLOAT *ao;
FLOAT_V_T vb, va1;
size_t vl;
#ifdef UNIT
VBOOL_T vbool_eq;
#endif
VBOOL_T vbool_cmp;
UINT_V_T vindex;
for (js = n; js > 0; js -= vl)
{
vl = VSETVL(js);
X = posX;
if (posX <= posY)
{
ao = a + posX + posY * lda;
}
else
{
ao = a + posY + posX * lda;
}
i = 0;
do
{
if (X < posY)
{
va1 = VLSEV_FLOAT(ao, stride_lda, vl);
VSEV_FLOAT(b, va1, vl);
ao ++;
b += vl;
X ++;
i ++;
}
else if (X > posY)
{
ao += lda;
b += vl;
X ++;
i ++;
}
else
{
vindex = VID_V_UINT(vl);
for (unsigned int j = 0; j < vl; j++)
{
va1 = VLSEV_FLOAT(ao, stride_lda, vl);
vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl);
vb = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl);
#ifdef UNIT
vbool_eq = VMSEQ_VX_UINT(vindex, j, vl);
vb = VFMERGE_VFM_FLOAT(vbool_eq, vb, ONE, vl);
#endif
VSEV_FLOAT(b, vb, vl);
ao++;
b += vl;
}
X += vl;
i += vl;
}
} while (i < m);
posY += vl;
}
return 0;
}


@ -0,0 +1,133 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m2(n)
#define FLOAT_V_T vfloat32m2_t
#define VLEV_FLOAT vle32_v_f32m2
#define VSEV_FLOAT vse32_v_f32m2
#define VBOOL_T vbool16_t
#define UINT_V_T vuint32m2_t
#define VID_V_UINT vid_v_u32m2
#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16
#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16
#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2
#else
#define VSETVL(n) vsetvl_e64m2(n)
#define FLOAT_V_T vfloat64m2_t
#define VLEV_FLOAT vle64_v_f64m2
#define VSEV_FLOAT vse64_v_f64m2
#define VBOOL_T vbool32_t
#define UINT_V_T vuint64m2_t
#define VID_V_UINT vid_v_u64m2
#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32
#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32
#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2
#endif
// Optimizes the implementation in ../arm64/trmm_utcopy_sve_v1.c
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
BLASLONG i, j, js, X;
FLOAT *ao;
FLOAT_V_T vb, va1;
#ifdef UNIT
VBOOL_T vbool_eq;
#endif
VBOOL_T vbool_cmp;
UINT_V_T vindex;
size_t vl;
for (js = n; js > 0; js -= vl)
{
vl = VSETVL(js);
X = posX;
if (posX <= posY)
{
ao = a + posX + posY * lda;
}
else
{
ao = a + posY + posX * lda;
}
i = 0;
do
{
if (X < posY)
{
ao ++;
b += vl;
X ++;
i++;
}
else if (X > posY)
{
va1 = VLEV_FLOAT(ao, vl);
VSEV_FLOAT(b, va1, vl);
ao += lda;
b += vl;
X++;
i++;
}
else
{
vindex = VID_V_UINT(vl);
for (j = 0; j < vl; j++)
{
va1 = VLEV_FLOAT(ao, vl);
vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl);
vb = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl);
#ifdef UNIT
vbool_eq = VMSEQ_VX_UINT(vindex, j, vl);
vb = VFMERGE_VFM_FLOAT(vbool_eq, vb, ONE, vl);
#endif
VSEV_FLOAT(b, vb, vl);
ao += lda;
b += vl;
}
X += vl;
i += vl;
}
} while (i < m);
posY += vl;
}
return 0;
}


@ -0,0 +1,685 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m2(n)
#define FLOAT_V_T vfloat32m2_t
#define VLEV_FLOAT vle32_v_f32m2
#define VSEV_FLOAT vse32_v_f32m2
#define VFMVVF_FLOAT vfmv_v_f_f32m2
#define VFMACCVF_FLOAT vfmacc_vf_f32m2
#define VFMULVF_FLOAT vfmul_vf_f32m2
#else
#define VSETVL(n) vsetvl_e64m2(n)
#define FLOAT_V_T vfloat64m2_t
#define VLEV_FLOAT vle64_v_f64m2
#define VSEV_FLOAT vse64_v_f64m2
#define VFMVVF_FLOAT vfmv_v_f_f64m2
#define VFMACCVF_FLOAT vfmacc_vf_f64m2
#define VFMULVF_FLOAT vfmul_vf_f64m2
#endif
// Optimizes the implementation in ../generic/trmmkernel_8x8.c
int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset)
{
//fprintf(stderr, "%s, %s, bm=%4ld bn=%4ld bk=%4ld alpha=%f ldc=%ld\n", __FILE__, __FUNCTION__, bm, bn, bk, alpha, ldc);
BLASLONG i,j,k;
FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7,*ptrba,*ptrbb;
FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7;
FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7;
size_t vl;
BLASLONG off, temp;
#if !defined(LEFT)
off = -offset;
#else
off = 0;
#endif
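// Standard TRMM bookkeeping: 'off' tracks the triangular boundary and 'temp'
// limits the k loop to the part of A/B that actually contributes (see the
// LEFT/TRANSA cases below); the inner loop is unrolled by 8 over k, with vl
// rows of A processed per vector register.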
for (j = bn/8; j > 0; j--)
{
C0 = C;
C1 = C0+ldc;
C2 = C1+ldc;
C3 = C2+ldc;
C4 = C3+ldc;
C5 = C4+ldc;
C6 = C5+ldc;
C7 = C6+ldc;
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
ptrba = ba;
for (i = bm; i > 0; i -= vl)
{
vl = VSETVL(i);
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*vl;
ptrbb = bb + off*8;
#endif
vres0 = VFMVVF_FLOAT(0.0, vl);
vres1 = VFMVVF_FLOAT(0.0, vl);
vres2 = VFMVVF_FLOAT(0.0, vl);
vres3 = VFMVVF_FLOAT(0.0, vl);
vres4 = VFMVVF_FLOAT(0.0, vl);
vres5 = VFMVVF_FLOAT(0.0, vl);
vres6 = VFMVVF_FLOAT(0.0, vl);
vres7 = VFMVVF_FLOAT(0.0, vl);
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+vl; // number of values in A
#else
temp = off+8; // number of values in B
#endif
for (k = temp/8; k > 0; k--) {
va0 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
va1 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl);
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl);
vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl);
vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl);
vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl);
vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl);
ptrbb += 8;
va2 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl);
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl);
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl);
vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va1, vl);
vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va1, vl);
vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va1, vl);
vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va1, vl);
ptrbb += 8;
va3 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl);
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl);
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl);
vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va2, vl);
vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va2, vl);
vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va2, vl);
vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va2, vl);
ptrbb += 8;
va4 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl);
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl);
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl);
vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va3, vl);
vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va3, vl);
vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va3, vl);
vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va3, vl);
ptrbb += 8;
va5 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl);
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl);
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl);
vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va4, vl);
vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va4, vl);
vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va4, vl);
vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va4, vl);
ptrbb += 8;
va6 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl);
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl);
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl);
vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va5, vl);
vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va5, vl);
vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va5, vl);
vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va5, vl);
ptrbb += 8;
va7 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl);
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl);
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl);
vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va6, vl);
vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va6, vl);
vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va6, vl);
vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va6, vl);
ptrbb += 8;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl);
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl);
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl);
vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va7, vl);
vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va7, vl);
vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va7, vl);
vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va7, vl);
ptrbb += 8;
}
for (k = temp&7; k > 0; k--) {
va0 = VLEV_FLOAT(ptrba, vl); // one vl-wide column chunk of A per remaining K iteration
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl);
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl);
vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl);
vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl);
vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl);
vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl);
ptrbb += 8;
ptrba += vl;
}
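// Scale the accumulators by alpha and write the tile back; the TRMM kernel
// overwrites C directly (there is no beta term).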
va0 = VFMULVF_FLOAT(vres0, alpha, vl);
VSEV_FLOAT(C0, va0, vl);
va1 = VFMULVF_FLOAT(vres1, alpha, vl);
VSEV_FLOAT(C1, va1, vl);
va2 = VFMULVF_FLOAT(vres2, alpha, vl);
VSEV_FLOAT(C2, va2, vl);
va3 = VFMULVF_FLOAT(vres3, alpha, vl);
VSEV_FLOAT(C3, va3, vl);
va4 = VFMULVF_FLOAT(vres4, alpha, vl);
VSEV_FLOAT(C4, va4, vl);
va5 = VFMULVF_FLOAT(vres5, alpha, vl);
VSEV_FLOAT(C5, va5, vl);
va6 = VFMULVF_FLOAT(vres6, alpha, vl);
VSEV_FLOAT(C6, va6, vl);
va7 = VFMULVF_FLOAT(vres7, alpha, vl);
VSEV_FLOAT(C7, va7, vl);
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= vl; // number of values in A
#else
temp -= 8; // number of values in B
#endif
ptrba += temp*vl;
ptrbb += temp*8;
#endif
#ifdef LEFT
off += vl; // number of values in A
#endif
C0 += vl;
C1 += vl;
C2 += vl;
C3 += vl;
C4 += vl;
C5 += vl;
C6 += vl;
C7 += vl;
}
#if defined(TRMMKERNEL) && !defined(LEFT)
off += 8;
#endif
bb += (bk<<3);
C += (ldc<<3);
}
if (bn & 4)
{
C0 = C;
C1 = C0+ldc;
C2 = C1+ldc;
C3 = C2+ldc;
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
ptrba = ba;
for (i = bm; i > 0; i -= vl)
{
vl = VSETVL(i);
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*vl;
ptrbb = bb + off*4;
#endif
vres0 = VFMVVF_FLOAT(0.0, vl);
vres1 = VFMVVF_FLOAT(0.0, vl);
vres2 = VFMVVF_FLOAT(0.0, vl);
vres3 = VFMVVF_FLOAT(0.0, vl);
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+vl; // number of values in A
#else
temp = off+4; // number of values in B
#endif
for (k = temp/8; k > 0; k--) {
va0 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
va1 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl);
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl);
ptrbb += 4;
va2 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl);
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl);
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl);
ptrbb += 4;
va3 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl);
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl);
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl);
ptrbb += 4;
va4 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl);
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl);
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl);
ptrbb += 4;
va5 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl);
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl);
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl);
ptrbb += 4;
va6 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl);
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl);
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl);
ptrbb += 4;
va7 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl);
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl);
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl);
ptrbb += 4;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl);
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl);
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl);
ptrbb += 4;
}
// K remainder
for (k = temp&7; k > 0; k--) {
va0 = VLEV_FLOAT(ptrba, vl);
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl);
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl);
ptrbb += 4;
ptrba += vl;
}
va0 = VFMULVF_FLOAT(vres0, alpha, vl);
VSEV_FLOAT(C0, va0, vl);
va1 = VFMULVF_FLOAT(vres1, alpha, vl);
VSEV_FLOAT(C1, va1, vl);
va2 = VFMULVF_FLOAT(vres2, alpha, vl);
VSEV_FLOAT(C2, va2, vl);
va3 = VFMULVF_FLOAT(vres3, alpha, vl);
VSEV_FLOAT(C3, va3, vl);
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= vl; // number of values in A
#else
temp -= 4; // number of values in B
#endif
ptrba += temp*vl;
ptrbb += temp*4;
#endif
#ifdef LEFT
off += vl; // number of values in A
#endif
C0 += vl;
C1 += vl;
C2 += vl;
C3 += vl;
}
#if defined(TRMMKERNEL) && !defined(LEFT)
off += 4;
#endif
bb += (bk<<2);
C += (ldc<<2);
}
if (bn & 2)
{
C0 = C;
C1 = C0+ldc;
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
ptrba = ba;
for (i = bm; i > 0; i -= vl)
{
vl = VSETVL(i);
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*vl;
ptrbb = bb + off*2;
#endif
vres0 = VFMVVF_FLOAT(0.0, vl);
vres1 = VFMVVF_FLOAT(0.0, vl);
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+vl; // number of values in A
#else
temp = off+2; // number of values in B
#endif
for (k = temp/8; k > 0; k--) {
va0 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
va1 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
ptrbb += 2;
va2 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl);
ptrbb += 2;
va3 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl);
ptrbb += 2;
va4 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl);
ptrbb += 2;
va5 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl);
ptrbb += 2;
va6 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl);
ptrbb += 2;
va7 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl);
ptrbb += 2;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl);
ptrbb += 2;
}
// K remainder
for (k = temp&7; k > 0; k--) {
va0 = VLEV_FLOAT(ptrba, vl);
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
ptrbb += 2;
ptrba += vl;
}
va0 = VFMULVF_FLOAT(vres0, alpha, vl);
VSEV_FLOAT(C0, va0, vl);
va1 = VFMULVF_FLOAT(vres1, alpha, vl);
VSEV_FLOAT(C1, va1, vl);
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= vl; // number of values in A
#else
temp -= 2; // number of values in B
#endif
ptrba += temp*vl;
ptrbb += temp*2;
#endif
#ifdef LEFT
off += vl; // number of values in A
#endif
C0 += vl;
C1 += vl;
}
#if defined(TRMMKERNEL) && !defined(LEFT)
off += 2;
#endif
bb += (bk<<1);
C += (ldc<<1);
}
if (bn & 1)
{
C0 = C;
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
ptrba = ba;
for (i = bm; i > 0; i -= vl)
{
vl = VSETVL(i);
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*vl;
ptrbb = bb + off*1;
#endif
vres0 = VFMVVF_FLOAT(0.0, vl);
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+vl; // number of values in A
#else
temp = off+1; // number of values in B
#endif
for (k = temp/8; k > 0; k--) {
va0 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
va1 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
ptrbb += 1;
va2 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl);
ptrbb += 1;
va3 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl);
ptrbb += 1;
va4 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl);
ptrbb += 1;
va5 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl);
ptrbb += 1;
va6 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl);
ptrbb += 1;
va7 = VLEV_FLOAT(ptrba, vl);
ptrba += vl;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl);
ptrbb += 1;
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl);
ptrbb += 1;
}
// K remainder
for (k = temp&7; k > 0; k--) {
va0 = VLEV_FLOAT(ptrba, vl);
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
ptrbb += 1;
ptrba += vl;
}
va0 = VFMULVF_FLOAT(vres0, alpha, vl);
VSEV_FLOAT(C0, va0, vl);
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= vl; // number of values in A
#else
temp -= 1; // number of values in B
#endif
ptrba += temp*vl;
ptrbb += temp*1;
#endif
#ifdef LEFT
off += vl; // number of values in A
#endif
C0 += vl;
}
#if defined(TRMMKERNEL) && !defined(LEFT)
off += 1;
#endif
bb += (bk);
C += (ldc);
}
return 0;
}

View File

@ -0,0 +1,847 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m2(n)
#define VSETVL_MAX vsetvlmax_e32m2()
#define FLOAT_V_T vfloat32m2_t
#define VLEV_FLOAT vle32_v_f32m2
#define VLSEV_FLOAT vlse32_v_f32m2
#define VLSEG2_FLOAT vlseg2e32_v_f32m2
#define VSEV_FLOAT vse32_v_f32m2
#define VSSEV_FLOAT vsse32_v_f32m2
#define VSSEG2_FLOAT vsseg2e32_v_f32m2
#define VFMACCVF_FLOAT vfmacc_vf_f32m2
#define VFMULVF_FLOAT vfmul_vf_f32m2
#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2
#else
#define VSETVL(n) vsetvl_e64m2(n)
#define VSETVL_MAX vsetvlmax_e64m2()
#define FLOAT_V_T vfloat64m2_t
#define VLEV_FLOAT vle64_v_f64m2
#define VLSEV_FLOAT vlse64_v_f64m2
#define VLSEG2_FLOAT vlseg2e64_v_f64m2
#define VSEV_FLOAT vse64_v_f64m2
#define VSSEV_FLOAT vsse64_v_f64m2
#define VSSEG2_FLOAT vsseg2e64_v_f64m2
#define VFMACCVF_FLOAT vfmacc_vf_f64m2
#define VFMULVF_FLOAT vfmul_vf_f64m2
#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2
#endif
static FLOAT dm1 = -1.;
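// dm1 is passed as alpha to GEMM_KERNEL so that the contribution of the already
// solved part of the panel is subtracted from C before the triangular solve.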
#ifdef CONJ
#define GEMM_KERNEL GEMM_KERNEL_L
#else
#define GEMM_KERNEL GEMM_KERNEL_N
#endif
#if GEMM_DEFAULT_UNROLL_N == 1
#define GEMM_UNROLL_N_SHIFT 0
#endif
#if GEMM_DEFAULT_UNROLL_N == 2
#define GEMM_UNROLL_N_SHIFT 1
#endif
#if GEMM_DEFAULT_UNROLL_N == 4
#define GEMM_UNROLL_N_SHIFT 2
#endif
#if GEMM_DEFAULT_UNROLL_N == 8
#define GEMM_UNROLL_N_SHIFT 3
#endif
#if GEMM_DEFAULT_UNROLL_N == 16
#define GEMM_UNROLL_N_SHIFT 4
#endif
// Optimizes the implementation in ../arm64/trsm_kernel_LN_sve.c
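// solve(): substitution on the packed triangular diagonal block, walking the rows
// bottom-up (i = m-1 .. 0).  For each solved element the rows above it are updated
// with a vectorized AXPY (VFNMSACVF_FLOAT: vc[t] -= bb * va[t]).  One specialization
// per GEMM_DEFAULT_UNROLL_N unrolls the loop over the RHS columns.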
#ifndef COMPLEX
#if GEMM_DEFAULT_UNROLL_N == 1
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
FLOAT aa, bb;
FLOAT *pa, *pc;
int i, j, k;
//fprintf(stderr, "%s , %s, m = %4ld n = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, ldc); // Debug
size_t vl;
FLOAT_V_T va, vc;
a += (m - 1) * m;
b += (m - 1) * n;
for (i = m - 1; i >= 0; i--)
{
aa = *(a + i);
for (j = 0; j < n; j ++)
{
bb = *(c + i + j * ldc);
bb *= aa;
*b = bb;
*(c + i + j * ldc) = bb;
b ++;
pa = a;
pc = c + j * ldc;
for (k = i; k > 0; k -= vl)
{
vl = VSETVL(k);
vc = VLEV_FLOAT(pc, vl);
va = VLEV_FLOAT(pa, vl);
vc = VFNMSACVF_FLOAT(vc, bb, va, vl);
VSEV_FLOAT(pc, vc, vl);
pa += vl;
pc += vl;
}
}
a -= m;
b -= 2 * n;
}
}
#elif GEMM_DEFAULT_UNROLL_N == 2
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
FLOAT aa, bb0, bb1;
FLOAT *pa, *pc, *pc0, *pc1;
FLOAT *pb0, *pb1;
int i, j, k;
//fprintf(stderr, "%s , %s, m = %4ld n = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, ldc); // Debug
size_t vl;
FLOAT_V_T va, vc0, vc1;
a += (m - 1) * m;
b += (m - 1) * n;
for (i = m - 1; i >= 0; i--)
{
aa = *(a + i);
pc = c + i;
for (j = 0; j < n/2; j ++)
{
//bb = *(c + i + j * ldc);
pb0 = pc + j * ldc * 2;
pb1 = pb0 + ldc;
//bb *= aa;
bb0 = (*pb0) * aa;
bb1 = (*pb1) * aa;
//*b = bb;
*b = bb0;
*(b+1) = bb1;
*pb0 = bb0;
*pb1 = bb1;
//*(c + i + j * ldc) = bb;
//b ++;
b += 2;
//pa = a + i + 1;
pc0 = c + j * ldc * 2;
pc1 = pc0 + ldc;
pa = a;
//pc = c + j * ldc;
for (k = i; k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLEV_FLOAT(pc0, vl);
vc1 = VLEV_FLOAT(pc1, vl);
va = VLEV_FLOAT(pa, vl);
vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
VSEV_FLOAT(pc0, vc0, vl);
VSEV_FLOAT(pc1, vc1, vl);
pa += vl;
pc0 += vl;
pc1 += vl;
}
}
pc += ldc * (n/2) * 2;
if (n & 1)
{
pb0 = pc;
bb0 = (*pb0) * aa;
*b = bb0;
*pb0 = bb0;
b += 1;
pc0 = pc - i;
pa = a;
for (k = i; k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLEV_FLOAT(pc0, vl);
va = VLEV_FLOAT(pa, vl);
vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
VSEV_FLOAT(pc0, vc0, vl);
pa += vl;
pc0 += vl;
}
}
a -= m;
b -= 2 * n;
}
}
#elif GEMM_DEFAULT_UNROLL_N == 4
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
FLOAT aa, bb0, bb1, bb2, bb3;
FLOAT *pa, *pc, *pc0, *pc1, *pc2, *pc3;
FLOAT *pb0, *pb1, *pb2, *pb3;
int i, j, k;
size_t vl;
FLOAT_V_T va, vc0, vc1, vc2, vc3;
a += (m - 1) * m;
b += (m - 1) * n;
for (i = m - 1; i >= 0; i--)
{
aa = *(a + i);
pc = c + i;
for (j = 0; j < n/4; j ++)
{
pb0 = pc + j * ldc * 4;
pb1 = pb0 + ldc;
pb2 = pb1 + ldc;
pb3 = pb2 + ldc;
bb0 = (*pb0) * aa;
bb1 = (*pb1) * aa;
bb2 = (*pb2) * aa;
bb3 = (*pb3) * aa;
*b = bb0;
*(b+1) = bb1;
*(b+2) = bb2;
*(b+3) = bb3;
*pb0 = bb0;
*pb1 = bb1;
*pb2 = bb2;
*pb3 = bb3;
b += 4;
pc0 = c + j * ldc * 4;
pc1 = pc0 + ldc;
pc2 = pc1 + ldc;
pc3 = pc2 + ldc;
pa = a;
for (k = i; k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLEV_FLOAT(pc0, vl);
vc1 = VLEV_FLOAT(pc1, vl);
vc2 = VLEV_FLOAT(pc2, vl);
vc3 = VLEV_FLOAT(pc3, vl);
va = VLEV_FLOAT(pa, vl);
vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl);
vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl);
VSEV_FLOAT(pc0, vc0, vl);
VSEV_FLOAT(pc1, vc1, vl);
VSEV_FLOAT(pc2, vc2, vl);
VSEV_FLOAT(pc3, vc3, vl);
pa += vl;
pc0 += vl;
pc1 += vl;
pc2 += vl;
pc3 += vl;
}
}
pc += ldc * (n/4) * 4;
if (n & 2)
{
pb0 = pc + j * ldc * 2;
pb1 = pb0 + ldc;
bb0 = (*pb0) * aa;
bb1 = (*pb1) * aa;
*b = bb0;
*(b+1) = bb1;
*pb0 = bb0;
*pb1 = bb1;
b += 2;
pc0 = c + j * ldc * 2;
pc1 = pc0 + ldc;
pa = a;
for (k = i; k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLEV_FLOAT(pc0, vl);
vc1 = VLEV_FLOAT(pc1, vl);
va = VLEV_FLOAT(pa, vl);
vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
VSEV_FLOAT(pc0, vc0, vl);
VSEV_FLOAT(pc1, vc1, vl);
pa += vl;
pc0 += vl;
pc1 += vl;
}
pc += ldc * 2;
}
if (n & 1)
{
pb0 = pc;
bb0 = (*pb0) * aa;
*b = bb0;
*pb0 = bb0;
b += 1;
pc0 = pc - i;
pa = a;
for (k = i; k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLEV_FLOAT(pc0, vl);
va = VLEV_FLOAT(pa, vl);
vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
VSEV_FLOAT(pc0, vc0, vl);
pa += vl;
pc0 += vl;
}
}
a -= m;
b -= 2 * n;
}
}
#elif GEMM_DEFAULT_UNROLL_N == 8
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
FLOAT aa, bb0, bb1, bb2, bb3, bb4, bb5, bb6, bb7;
FLOAT *pa, *pc, *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7;
FLOAT *pb0, *pb1, *pb2, *pb3, *pb4, *pb5, *pb6, *pb7;
int i, j, k;
size_t vl;
FLOAT_V_T va, vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7;
a += (m - 1) * m;
b += (m - 1) * n;
for (i = m - 1; i >= 0; i--)
{
aa = *(a + i);
pc = c + i;
for (j = 0; j < n/8; j ++)
{
pb0 = pc + j * ldc * 8;
pb1 = pb0 + ldc;
pb2 = pb1 + ldc;
pb3 = pb2 + ldc;
pb4 = pb3 + ldc;
pb5 = pb4 + ldc;
pb6 = pb5 + ldc;
pb7 = pb6 + ldc;
bb0 = (*pb0) * aa;
bb1 = (*pb1) * aa;
bb2 = (*pb2) * aa;
bb3 = (*pb3) * aa;
bb4 = (*pb4) * aa;
bb5 = (*pb5) * aa;
bb6 = (*pb6) * aa;
bb7 = (*pb7) * aa;
*b = bb0;
*(b+1) = bb1;
*(b+2) = bb2;
*(b+3) = bb3;
*(b+4) = bb4;
*(b+5) = bb5;
*(b+6) = bb6;
*(b+7) = bb7;
*pb0 = bb0;
*pb1 = bb1;
*pb2 = bb2;
*pb3 = bb3;
*pb4 = bb4;
*pb5 = bb5;
*pb6 = bb6;
*pb7 = bb7;
b += 8;
pc0 = c + j * ldc * 8;
pc1 = pc0 + ldc;
pc2 = pc1 + ldc;
pc3 = pc2 + ldc;
pc4 = pc3 + ldc;
pc5 = pc4 + ldc;
pc6 = pc5 + ldc;
pc7 = pc6 + ldc;
pa = a;
for (k = i; k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLEV_FLOAT(pc0, vl);
vc1 = VLEV_FLOAT(pc1, vl);
vc2 = VLEV_FLOAT(pc2, vl);
vc3 = VLEV_FLOAT(pc3, vl);
vc4 = VLEV_FLOAT(pc4, vl);
vc5 = VLEV_FLOAT(pc5, vl);
vc6 = VLEV_FLOAT(pc6, vl);
vc7 = VLEV_FLOAT(pc7, vl);
va = VLEV_FLOAT(pa, vl);
vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl);
vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl);
vc4 = VFNMSACVF_FLOAT(vc4, bb4, va, vl);
vc5 = VFNMSACVF_FLOAT(vc5, bb5, va, vl);
vc6 = VFNMSACVF_FLOAT(vc6, bb6, va, vl);
vc7 = VFNMSACVF_FLOAT(vc7, bb7, va, vl);
VSEV_FLOAT(pc0, vc0, vl);
VSEV_FLOAT(pc1, vc1, vl);
VSEV_FLOAT(pc2, vc2, vl);
VSEV_FLOAT(pc3, vc3, vl);
VSEV_FLOAT(pc4, vc4, vl);
VSEV_FLOAT(pc5, vc5, vl);
VSEV_FLOAT(pc6, vc6, vl);
VSEV_FLOAT(pc7, vc7, vl);
pa += vl;
pc0 += vl;
pc1 += vl;
pc2 += vl;
pc3 += vl;
pc4 += vl;
pc5 += vl;
pc6 += vl;
pc7 += vl;
}
}
pc += ldc * (n/8) * 8;
if (n & 4)
{
pb0 = pc + j * ldc * 4;
pb1 = pb0 + ldc;
pb2 = pb1 + ldc;
pb3 = pb2 + ldc;
bb0 = (*pb0) * aa;
bb1 = (*pb1) * aa;
bb2 = (*pb2) * aa;
bb3 = (*pb3) * aa;
*b = bb0;
*(b+1) = bb1;
*(b+2) = bb2;
*(b+3) = bb3;
*pb0 = bb0;
*pb1 = bb1;
*pb2 = bb2;
*pb3 = bb3;
b += 4;
pc0 = c + j * ldc * 4;
pc1 = pc0 + ldc;
pc2 = pc1 + ldc;
pc3 = pc2 + ldc;
pa = a;
for (k = i; k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLEV_FLOAT(pc0, vl);
vc1 = VLEV_FLOAT(pc1, vl);
vc2 = VLEV_FLOAT(pc2, vl);
vc3 = VLEV_FLOAT(pc3, vl);
va = VLEV_FLOAT(pa, vl);
vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl);
vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl);
VSEV_FLOAT(pc0, vc0, vl);
VSEV_FLOAT(pc1, vc1, vl);
VSEV_FLOAT(pc2, vc2, vl);
VSEV_FLOAT(pc3, vc3, vl);
pa += vl;
pc0 += vl;
pc1 += vl;
pc2 += vl;
pc3 += vl;
}
pc += ldc * 4;
}
if (n & 2)
{
pb0 = pc + j * ldc * 2;
pb1 = pb0 + ldc;
bb0 = (*pb0) * aa;
bb1 = (*pb1) * aa;
*b = bb0;
*(b+1) = bb1;
*pb0 = bb0;
*pb1 = bb1;
b += 2;
pc0 = c + j * ldc * 2;
pc1 = pc0 + ldc;
pa = a;
for (k = i; k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLEV_FLOAT(pc0, vl);
vc1 = VLEV_FLOAT(pc1, vl);
va = VLEV_FLOAT(pa, vl);
vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
VSEV_FLOAT(pc0, vc0, vl);
VSEV_FLOAT(pc1, vc1, vl);
pa += vl;
pc0 += vl;
pc1 += vl;
}
pc += ldc * 2;
}
if (n & 1)
{
pb0 = pc;
bb0 = (*pb0) * aa;
*b = bb0;
*pb0 = bb0;
b += 1;
pc0 = pc - i;
pa = a;
for (k = i; k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLEV_FLOAT(pc0, vl);
va = VLEV_FLOAT(pa, vl);
vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
VSEV_FLOAT(pc0, vc0, vl);
pa += vl;
pc0 += vl;
}
}
a -= m;
b -= 2 * n;
}
}
#else
static inline void solve_generic(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
FLOAT aa, bb;
int i, j, k;
a += (m - 1) * m;
b += (m - 1) * n;
for (i = m - 1; i >= 0; i--) {
aa = *(a + i);
for (j = 0; j < n; j ++) {
bb = *(c + i + j * ldc);
bb *= aa;
*b = bb;
*(c + i + j * ldc) = bb;
b ++;
for (k = 0; k < i; k ++){
*(c + k + j * ldc) -= bb * *(a + k);
}
}
a -= m;
b -= 2 * n;
}
}
#endif
#else
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
FLOAT aa1, aa2;
FLOAT bb1, bb2;
FLOAT cc1, cc2;
int i, j, k;
ldc *= 2;
a += (m - 1) * m * 2;
b += (m - 1) * n * 2;
for (i = m - 1; i >= 0; i--) {
aa1 = *(a + i * 2 + 0);
aa2 = *(a + i * 2 + 1);
for (j = 0; j < n; j ++) {
bb1 = *(c + i * 2 + 0 + j * ldc);
bb2 = *(c + i * 2 + 1 + j * ldc);
#ifndef CONJ
cc1 = aa1 * bb1 - aa2 * bb2;
cc2 = aa1 * bb2 + aa2 * bb1;
#else
cc1 = aa1 * bb1 + aa2 * bb2;
cc2 = aa1 * bb2 - aa2 * bb1;
#endif
*(b + 0) = cc1;
*(b + 1) = cc2;
*(c + i * 2 + 0 + j * ldc) = cc1;
*(c + i * 2 + 1 + j * ldc) = cc2;
b += 2;
for (k = 0; k < i; k ++){
#ifndef CONJ
*(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1);
*(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0);
#else
*(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1);
*(c + k * 2 + 1 + j * ldc) -= - cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0);
#endif
}
}
a -= m * 2;
b -= 4 * n;
}
}
#endif
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
#ifdef COMPLEX
FLOAT dummy2,
#endif
FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){
BLASLONG i, j;
FLOAT *aa, *cc;
BLASLONG kk;
size_t vl = VSETVL_MAX;
//fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug
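// Driver: the N dimension is tiled by GEMM_UNROLL_N and the M dimension by the
// maximum hardware vector length vl (VSETVL_MAX).  Because LN walks M from the
// bottom, the m % vl remainder block is handled first and the full vl-sized blocks
// follow in a descending do/while; kk marks the boundary between the part of the
// panel that is updated via GEMM_KERNEL (alpha = -1) and the part handled by solve().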
j = (n >> GEMM_UNROLL_N_SHIFT);
while (j > 0) {
kk = m + offset;
i = m % vl;
if (i) {
aa = a + (m - i) * k * COMPSIZE;
cc = c + (m - i) * COMPSIZE;
if (k - kk > 0) {
GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1,
#ifdef COMPLEX
ZERO,
#endif
aa + i * kk * COMPSIZE,
b + GEMM_UNROLL_N * kk * COMPSIZE,
cc,
ldc);
}
solve(i, GEMM_UNROLL_N,
aa + (kk - i) * i * COMPSIZE,
b + (kk - i) * GEMM_UNROLL_N * COMPSIZE,
cc, ldc);
kk -= i;
}
int mod = i;
i = vl;
if (i <= m) {
aa = a + (m - mod - vl) * k * COMPSIZE;
cc = c + (m - mod - vl) * COMPSIZE;
do {
if (k - kk > 0) {
GEMM_KERNEL(vl, GEMM_UNROLL_N, k - kk, dm1,
#ifdef COMPLEX
ZERO,
#endif
aa + vl * kk * COMPSIZE,
b + GEMM_UNROLL_N * kk * COMPSIZE,
cc,
ldc);
}
solve(vl, GEMM_UNROLL_N,
aa + (kk - vl) * vl * COMPSIZE,
b + (kk - vl) * GEMM_UNROLL_N * COMPSIZE,
cc, ldc);
aa -= vl * k * COMPSIZE;
cc -= vl * COMPSIZE;
kk -= vl;
i += vl;
} while (i <= m);
}
b += GEMM_UNROLL_N * k * COMPSIZE;
c += GEMM_UNROLL_N * ldc * COMPSIZE;
j --;
}
if (n & (GEMM_UNROLL_N - 1)) {
j = (GEMM_UNROLL_N >> 1);
while (j > 0) {
if (n & j) {
kk = m + offset;
i = m % vl;
if (i) {
aa = a + (m - i) * k * COMPSIZE;
cc = c + (m - i) * COMPSIZE;
if (k - kk > 0) {
GEMM_KERNEL(i, j, k - kk, dm1,
#ifdef COMPLEX
ZERO,
#endif
aa + i * kk * COMPSIZE,
b + j * kk * COMPSIZE,
cc, ldc);
}
solve(i, j,
aa + (kk - i) * i * COMPSIZE,
b + (kk - i) * j * COMPSIZE,
cc, ldc);
kk -= i;
}
int mod = i;
i = vl;
if (i <= m) {
aa = a + (m - mod - vl) * k * COMPSIZE;
cc = c + (m - mod - vl) * COMPSIZE;
do {
if (k - kk > 0) {
GEMM_KERNEL(vl, j, k - kk, dm1,
#ifdef COMPLEX
ZERO,
#endif
aa + vl * kk * COMPSIZE,
b + j * kk * COMPSIZE,
cc,
ldc);
}
solve(vl, j,
aa + (kk - vl) * vl * COMPSIZE,
b + (kk - vl) * j * COMPSIZE,
cc, ldc);
aa -= vl * k * COMPSIZE;
cc -= vl * COMPSIZE;
kk -= vl;
i += vl;
} while (i <= m);
}
b += j * k * COMPSIZE;
c += j * ldc * COMPSIZE;
}
j >>= 1;
}
}
return 0;
}

View File

@ -0,0 +1,840 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m2(n)
#define VSETVL_MAX vsetvlmax_e32m2()
#define FLOAT_V_T vfloat32m2_t
#define VLEV_FLOAT vle32_v_f32m2
#define VLSEV_FLOAT vlse32_v_f32m2
#define VLSEG2_FLOAT vlseg2e32_v_f32m2
#define VSEV_FLOAT vse32_v_f32m2
#define VSSEV_FLOAT vsse32_v_f32m2
#define VSSEG2_FLOAT vsseg2e32_v_f32m2
#define VFMACCVF_FLOAT vfmacc_vf_f32m2
#define VFMULVF_FLOAT vfmul_vf_f32m2
#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2
#else
#define VSETVL(n) vsetvl_e64m2(n)
#define VSETVL_MAX vsetvlmax_e64m2()
#define FLOAT_V_T vfloat64m2_t
#define VLEV_FLOAT vle64_v_f64m2
#define VLSEV_FLOAT vlse64_v_f64m2
#define VLSEG2_FLOAT vlseg2e64_v_f64m2
#define VSEV_FLOAT vse64_v_f64m2
#define VSSEV_FLOAT vsse64_v_f64m2
#define VSSEG2_FLOAT vsseg2e64_v_f64m2
#define VFMACCVF_FLOAT vfmacc_vf_f64m2
#define VFMULVF_FLOAT vfmul_vf_f64m2
#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2
#endif
static FLOAT dm1 = -1.;
#ifdef CONJ
#define GEMM_KERNEL GEMM_KERNEL_L
#else
#define GEMM_KERNEL GEMM_KERNEL_N
#endif
#if GEMM_DEFAULT_UNROLL_N == 1
#define GEMM_UNROLL_N_SHIFT 0
#endif
#if GEMM_DEFAULT_UNROLL_N == 2
#define GEMM_UNROLL_N_SHIFT 1
#endif
#if GEMM_DEFAULT_UNROLL_N == 4
#define GEMM_UNROLL_N_SHIFT 2
#endif
#if GEMM_DEFAULT_UNROLL_N == 8
#define GEMM_UNROLL_N_SHIFT 3
#endif
#if GEMM_DEFAULT_UNROLL_N == 16
#define GEMM_UNROLL_N_SHIFT 4
#endif
// Optimizes the implementation in ../arm64/trsm_kernel_LT_sve.c
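// solve(): same idea as the LN variant but walking the rows top-down (i = 0 .. m-1);
// after each solved element the rows below it are updated with a vectorized AXPY
// (VFNMSACVF_FLOAT: vc[t] -= bb * va[t]).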
#ifndef COMPLEX
#if GEMM_DEFAULT_UNROLL_N == 1
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc)
{
FLOAT aa, bb;
FLOAT *pa, *pc;
int i, j, k;
size_t vl;
FLOAT_V_T va, vc;
for (i = 0; i < m; i++)
{
aa = *(a + i);
for (j = 0; j < n; j ++)
{
bb = *(c + i + j * ldc);
bb *= aa;
*b = bb;
*(c + i + j * ldc) = bb;
b++;
pa = a + i + 1;
pc = c + j * ldc + i + 1;
for (k = (m - i - 1); k > 0; k -= vl)
{
vl = VSETVL(k);
vc = VLEV_FLOAT(pc, vl);
va = VLEV_FLOAT(pa, vl);
vc = VFNMSACVF_FLOAT(vc, bb, va, vl);
VSEV_FLOAT(pc, vc, vl);
pa += vl;
pc += vl;
}
}
a += m;
}
}
#elif GEMM_DEFAULT_UNROLL_N == 2
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc)
{
FLOAT aa, bb0, bb1;
FLOAT *pa, *pc, *pc0, *pc1;
FLOAT *pb0, *pb1;
int i, j, k;
size_t vl;
FLOAT_V_T va, vc0, vc1;
for (i = 0; i < m; i++)
{
aa = *(a + i);
pc = c + i;
for (j = 0; j < n/2; j ++)
{
pb0 = pc + j * ldc * 2;
pb1 = pb0 + ldc;
bb0 = (*pb0) * aa;
bb1 = (*pb1) * aa;
*b = bb0;
*(b+1) = bb1;
*pb0 = bb0;
*pb1 = bb1;
b += 2;
pa = a + i + 1;
pc0 = pb0 + 1;
pc1 = pc0 + ldc;
for (k = (m - i - 1); k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLEV_FLOAT(pc0, vl);
vc1 = VLEV_FLOAT(pc1, vl);
va = VLEV_FLOAT(pa, vl);
vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
VSEV_FLOAT(pc0, vc0, vl);
VSEV_FLOAT(pc1, vc1, vl);
pa += vl;
pc0 += vl;
pc1 += vl;
}
}
pc += ldc * (n/2) * 2;
if (n & 1)
{
pb0 = pc;
bb0 = *(pb0);
bb0 *= aa;
*b = bb0;
*(c + i) = bb0;
b++;
pa = a + i + 1;
pc0 = pb0 + 1;
for (k = (m - i - 1); k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLEV_FLOAT(pc0, vl);
va = VLEV_FLOAT(pa, vl);
vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
VSEV_FLOAT(pc0, vc0, vl);
pa += vl;
pc0 += vl;
}
}
a += m;
}
}
#elif GEMM_DEFAULT_UNROLL_N == 4
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc)
{
FLOAT aa, bb0, bb1, bb2, bb3;
FLOAT *pa, *pc;
FLOAT *pc0, *pc1, *pc2, *pc3;
FLOAT *pb0, *pb1, *pb2, *pb3;
int i, j, k;
size_t vl;
FLOAT_V_T va;
FLOAT_V_T vc0, vc1, vc2, vc3;
for (i = 0; i < m; i++)
{
aa = *(a + i);
pc = c + i;
for (j = 0; j < n/4; j ++)
{
pb0 = pc;
pb1 = pb0 + ldc;
pb2 = pb1 + ldc;
pb3 = pb2 + ldc;
bb0 = (*pb0) * aa;
bb1 = (*pb1) * aa;
bb2 = (*pb2) * aa;
bb3 = (*pb3) * aa;
*b = bb0;
*(b+1) = bb1;
*(b+2) = bb2;
*(b+3) = bb3;
*pb0 = bb0;
*pb1 = bb1;
*pb2 = bb2;
*pb3 = bb3;
b += 4;
pa = a + i + 1;
pc0 = pb0 + 1;
pc1 = pc0 + ldc;
pc2 = pc1 + ldc;
pc3 = pc2 + ldc;
for (k = (m - i - 1); k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLEV_FLOAT(pc0, vl);
vc1 = VLEV_FLOAT(pc1, vl);
vc2 = VLEV_FLOAT(pc2, vl);
vc3 = VLEV_FLOAT(pc3, vl);
va = VLEV_FLOAT(pa, vl);
vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl);
vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl);
VSEV_FLOAT(pc0, vc0, vl);
VSEV_FLOAT(pc1, vc1, vl);
VSEV_FLOAT(pc2, vc2, vl);
VSEV_FLOAT(pc3, vc3, vl);
pa += vl;
pc0 += vl;
pc1 += vl;
pc2 += vl;
pc3 += vl;
}
}
pc += ldc * (n/4) * 4;
if (n & 2)
{
pb0 = pc;
pb1 = pb0 + ldc;
bb0 = (*pb0) * aa;
bb1 = (*pb1) * aa;
*b = bb0;
*(b+1) = bb1;
*pb0 = bb0;
*pb1 = bb1;
b += 2;
pa = a + i + 1;
pc0 = pb0 + 1;
pc1 = pc0 + ldc;
for (k = (m - i - 1); k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLEV_FLOAT(pc0, vl);
vc1 = VLEV_FLOAT(pc1, vl);
va = VLEV_FLOAT(pa, vl);
vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
VSEV_FLOAT(pc0, vc0, vl);
VSEV_FLOAT(pc1, vc1, vl);
pa += vl;
pc0 += vl;
pc1 += vl;
}
pc += ldc * 2;
}
if (n & 1)
{
pb0 = pc;
bb0 = *(pb0);
bb0 *= aa;
*b = bb0;
*(c + i) = bb0;
b++;
pa = a + i + 1;
pc0 = pb0 + 1;
for (k = (m - i - 1); k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLEV_FLOAT(pc0, vl);
va = VLEV_FLOAT(pa, vl);
vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
VSEV_FLOAT(pc0, vc0, vl);
pa += vl;
pc0 += vl;
}
}
a += m;
}
}
#elif GEMM_DEFAULT_UNROLL_N == 8
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc)
{
FLOAT aa, bb0, bb1, bb2, bb3, bb4, bb5, bb6, bb7;
FLOAT *pa, *pc;
FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7;
FLOAT *pb0, *pb1, *pb2, *pb3, *pb4, *pb5, *pb6, *pb7;
int i, j, k;
size_t vl;
FLOAT_V_T va;
FLOAT_V_T vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7;
for (i = 0; i < m; i++)
{
aa = *(a + i);
pc = c + i;
for (j = 0; j < n/8; j ++)
{
pb0 = pc + j * ldc * 8;
pb1 = pb0 + ldc;
pb2 = pb1 + ldc;
pb3 = pb2 + ldc;
pb4 = pb3 + ldc;
pb5 = pb4 + ldc;
pb6 = pb5 + ldc;
pb7 = pb6 + ldc;
bb0 = (*pb0) * aa;
bb1 = (*pb1) * aa;
bb2 = (*pb2) * aa;
bb3 = (*pb3) * aa;
bb4 = (*pb4) * aa;
bb5 = (*pb5) * aa;
bb6 = (*pb6) * aa;
bb7 = (*pb7) * aa;
*b = bb0;
*(b+1) = bb1;
*(b+2) = bb2;
*(b+3) = bb3;
*(b+4) = bb4;
*(b+5) = bb5;
*(b+6) = bb6;
*(b+7) = bb7;
*pb0 = bb0;
*pb1 = bb1;
*pb2 = bb2;
*pb3 = bb3;
*pb4 = bb4;
*pb5 = bb5;
*pb6 = bb6;
*pb7 = bb7;
b += 8;
pa = a + i + 1;
pc0 = pb0 + 1;
pc1 = pc0 + ldc;
pc2 = pc1 + ldc;
pc3 = pc2 + ldc;
pc4 = pc3 + ldc;
pc5 = pc4 + ldc;
pc6 = pc5 + ldc;
pc7 = pc6 + ldc;
for (k = (m - i - 1); k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLEV_FLOAT(pc0, vl);
vc1 = VLEV_FLOAT(pc1, vl);
vc2 = VLEV_FLOAT(pc2, vl);
vc3 = VLEV_FLOAT(pc3, vl);
vc4 = VLEV_FLOAT(pc4, vl);
vc5 = VLEV_FLOAT(pc5, vl);
vc6 = VLEV_FLOAT(pc6, vl);
vc7 = VLEV_FLOAT(pc7, vl);
va = VLEV_FLOAT(pa, vl);
vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl);
vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl);
vc4 = VFNMSACVF_FLOAT(vc4, bb4, va, vl);
vc5 = VFNMSACVF_FLOAT(vc5, bb5, va, vl);
vc6 = VFNMSACVF_FLOAT(vc6, bb6, va, vl);
vc7 = VFNMSACVF_FLOAT(vc7, bb7, va, vl);
VSEV_FLOAT(pc0, vc0, vl);
VSEV_FLOAT(pc1, vc1, vl);
VSEV_FLOAT(pc2, vc2, vl);
VSEV_FLOAT(pc3, vc3, vl);
VSEV_FLOAT(pc4, vc4, vl);
VSEV_FLOAT(pc5, vc5, vl);
VSEV_FLOAT(pc6, vc6, vl);
VSEV_FLOAT(pc7, vc7, vl);
pa += vl;
pc0 += vl;
pc1 += vl;
pc2 += vl;
pc3 += vl;
pc4 += vl;
pc5 += vl;
pc6 += vl;
pc7 += vl;
}
}
pc += ldc * (n/8) * 8;
if (n & 4)
{
pb0 = pc;
pb1 = pb0 + ldc;
pb2 = pb1 + ldc;
pb3 = pb2 + ldc;
bb0 = (*pb0) * aa;
bb1 = (*pb1) * aa;
bb2 = (*pb2) * aa;
bb3 = (*pb3) * aa;
*b = bb0;
*(b+1) = bb1;
*(b+2) = bb2;
*(b+3) = bb3;
*pb0 = bb0;
*pb1 = bb1;
*pb2 = bb2;
*pb3 = bb3;
b += 4;
pa = a + i + 1;
pc0 = pb0 + 1;
pc1 = pc0 + ldc;
pc2 = pc1 + ldc;
pc3 = pc2 + ldc;
for (k = (m - i - 1); k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLEV_FLOAT(pc0, vl);
vc1 = VLEV_FLOAT(pc1, vl);
vc2 = VLEV_FLOAT(pc2, vl);
vc3 = VLEV_FLOAT(pc3, vl);
va = VLEV_FLOAT(pa, vl);
vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl);
vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl);
VSEV_FLOAT(pc0, vc0, vl);
VSEV_FLOAT(pc1, vc1, vl);
VSEV_FLOAT(pc2, vc2, vl);
VSEV_FLOAT(pc3, vc3, vl);
pa += vl;
pc0 += vl;
pc1 += vl;
pc2 += vl;
pc3 += vl;
}
pc += ldc * 4;
}
if (n & 2)
{
pb0 = pc;
pb1 = pb0 + ldc;
bb0 = (*pb0) * aa;
bb1 = (*pb1) * aa;
*b = bb0;
*(b+1) = bb1;
*pb0 = bb0;
*pb1 = bb1;
b += 2;
pa = a + i + 1;
pc0 = pb0 + 1;
pc1 = pc0 + ldc;
for (k = (m - i - 1); k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLEV_FLOAT(pc0, vl);
vc1 = VLEV_FLOAT(pc1, vl);
va = VLEV_FLOAT(pa, vl);
vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
VSEV_FLOAT(pc0, vc0, vl);
VSEV_FLOAT(pc1, vc1, vl);
pa += vl;
pc0 += vl;
pc1 += vl;
}
pc += ldc * 2;
}
if (n & 1)
{
pb0 = pc;
bb0 = *(pb0);
bb0 *= aa;
*b = bb0;
*(c + i) = bb0;
b++;
pa = a + i + 1;
pc0 = pb0 + 1;
for (k = (m - i - 1); k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLEV_FLOAT(pc0, vl);
va = VLEV_FLOAT(pa, vl);
vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
VSEV_FLOAT(pc0, vc0, vl);
pa += vl;
pc0 += vl;
}
}
a += m;
}
}
#else
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
FLOAT aa, bb;
int i, j, k;
for (i = 0; i < m; i++) {
aa = *(a + i);
for (j = 0; j < n; j ++) {
bb = *(c + i + j * ldc);
bb *= aa;
*b = bb;
*(c + i + j * ldc) = bb;
b ++;
for (k = i + 1; k < m; k ++){
*(c + k + j * ldc) -= bb * *(a + k);
}
}
a += m;
}
}
#endif
#else
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
FLOAT aa1, aa2;
FLOAT bb1, bb2;
FLOAT cc1, cc2;
int i, j, k;
ldc *= 2;
for (i = 0; i < m; i++) {
aa1 = *(a + i * 2 + 0);
aa2 = *(a + i * 2 + 1);
for (j = 0; j < n; j ++) {
bb1 = *(c + i * 2 + 0 + j * ldc);
bb2 = *(c + i * 2 + 1 + j * ldc);
#ifndef CONJ
cc1 = aa1 * bb1 - aa2 * bb2;
cc2 = aa1 * bb2 + aa2 * bb1;
#else
cc1 = aa1 * bb1 + aa2 * bb2;
cc2 = aa1 * bb2 - aa2 * bb1;
#endif
*(b + 0) = cc1;
*(b + 1) = cc2;
*(c + i * 2 + 0 + j * ldc) = cc1;
*(c + i * 2 + 1 + j * ldc) = cc2;
b += 2;
for (k = i + 1; k < m; k ++){
#ifndef CONJ
*(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1);
*(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0);
#else
*(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1);
*(c + k * 2 + 1 + j * ldc) -= -cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0);
#endif
}
}
a += m * 2;
}
}
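// solve_N1: vectorized complex variant using segment loads/stores (VLSEG2/VSSEG2)
// for the interleaved real/imaginary parts; it is not referenced by CNAME below,
// which calls the scalar solve() above.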
static inline void solve_N1(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
FLOAT aa1, aa2;
FLOAT bb1, bb2;
FLOAT cc1, cc2;
FLOAT *pa, *pc;
int i, j, k;
size_t vl;
FLOAT_V_T va0, va1, vc0, vc1;
ldc *= 2;
for (i = 0; i < m; i++) {
aa1 = *(a + i * 2 + 0);
aa2 = *(a + i * 2 + 1);
for (j = 0; j < n; j ++) {
bb1 = *(c + i * 2 + 0 + j * ldc);
bb2 = *(c + i * 2 + 1 + j * ldc);
#ifndef CONJ
cc1 = aa1 * bb1 - aa2 * bb2;
cc2 = aa1 * bb2 + aa2 * bb1;
#else
cc1 = aa1 * bb1 + aa2 * bb2;
cc2 = aa1 * bb2 - aa2 * bb1;
#endif
*(b + 0) = cc1;
*(b + 1) = cc2;
*(c + i * 2 + 0 + j * ldc) = cc1;
*(c + i * 2 + 1 + j * ldc) = cc2;
b += 2;
pa = a + (i + 1) * 2;
pc = c + j * ldc + (i + 1) * 2;
for (k = (m - i - 1); k > 0; k -= vl)
{
vl = VSETVL(k);
VLSEG2_FLOAT(&va0, &va1, pa, vl);
VLSEG2_FLOAT(&vc0, &vc1, pc, vl);
#ifndef CONJ
vc0 = VFNMSACVF_FLOAT(vc0, cc1, va0, vl);
vc0 = VFMACCVF_FLOAT(vc0, cc2, va1, vl);
vc1 = VFNMSACVF_FLOAT(vc1, cc1, va1, vl);
vc1 = VFNMSACVF_FLOAT(vc1, cc2, va0, vl);
#else
vc0 = VFNMSACVF_FLOAT(vc0, cc1, va0, vl);
vc0 = VFNMSACVF_FLOAT(vc0, cc2, va1, vl);
vc1 = VFMACCVF_FLOAT(vc1, cc1, va1, vl);
vc1 = VFNMSACVF_FLOAT(vc1, cc2, va0, vl);
#endif
VSSEG2_FLOAT(pc, vc0, vc1, vl);
pa += vl * 2;
pc += vl * 2;
}
}
a += m * 2;
}
}
#endif
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
#ifdef COMPLEX
FLOAT dummy2,
#endif
FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){
FLOAT *aa, *cc;
BLASLONG kk;
BLASLONG i, j;
size_t vl = VSETVL_MAX;
//fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug
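// Driver: columns are tiled by GEMM_UNROLL_N; rows are walked top-down in chunks
// of vl with the m % vl tail handled last.  kk (starting from offset) counts the
// rows already solved, so GEMM_KERNEL (alpha = -1) subtracts their contribution
// from C before solve() runs on the next block.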
j = (n >> GEMM_UNROLL_N_SHIFT);
while (j > 0) {
kk = offset;
aa = a;
cc = c;
i = vl;
while (i <= m) {
if (kk > 0) {
GEMM_KERNEL(vl, GEMM_UNROLL_N, kk, dm1,
#ifdef COMPLEX
ZERO,
#endif
aa, b, cc, ldc);
}
solve(vl, GEMM_UNROLL_N,
aa + kk * vl * COMPSIZE,
b + kk * GEMM_UNROLL_N * COMPSIZE,
cc, ldc);
aa += vl * k * COMPSIZE;
cc += vl * COMPSIZE;
kk += vl;
i += vl;
}
i = m % vl;
if (i) {
if (kk > 0) {
GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1,
#ifdef COMPLEX
ZERO,
#endif
aa, b, cc, ldc);
}
solve(i, GEMM_UNROLL_N,
aa + kk * i * COMPSIZE,
b + kk * GEMM_UNROLL_N * COMPSIZE,
cc, ldc);
aa += i * k * COMPSIZE;
cc += i * COMPSIZE;
kk += i;
}
b += GEMM_UNROLL_N * k * COMPSIZE;
c += GEMM_UNROLL_N * ldc * COMPSIZE;
j --;
}
if (n & (GEMM_UNROLL_N - 1)) {
j = (GEMM_UNROLL_N >> 1);
while (j > 0) {
if (n & j) {
kk = offset;
aa = a;
cc = c;
i = vl;
while (i <= m) {
if (kk > 0) {
GEMM_KERNEL(vl, j, kk, dm1,
#ifdef COMPLEX
ZERO,
#endif
aa,
b,
cc,
ldc);
}
solve(vl, j,
aa + kk * vl * COMPSIZE,
b + kk * j * COMPSIZE, cc, ldc);
aa += vl * k * COMPSIZE;
cc += vl * COMPSIZE;
kk += vl;
i += vl;
}
i = m % vl;
if (i) {
if (kk > 0) {
GEMM_KERNEL(i, j, kk, dm1,
#ifdef COMPLEX
ZERO,
#endif
aa,
b,
cc,
ldc);
}
solve(i, j,
aa + kk * i * COMPSIZE,
b + kk * j * COMPSIZE, cc, ldc);
aa += i * k * COMPSIZE;
cc += i * COMPSIZE;
kk += i;
}
b += j * k * COMPSIZE;
c += j * ldc * COMPSIZE;
}
j >>= 1;
}
}
return 0;
}

View File

@ -0,0 +1,792 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m2(n)
#define VSETVL_MAX vsetvlmax_e32m2()
#define FLOAT_V_T vfloat32m2_t
#define VLEV_FLOAT vle32_v_f32m2
#define VLSEV_FLOAT vlse32_v_f32m2
#define VLSEG2_FLOAT vlseg2e32_v_f32m2
#define VSEV_FLOAT vse32_v_f32m2
#define VSSEV_FLOAT vsse32_v_f32m2
#define VSSEG2_FLOAT vsseg2e32_v_f32m2
#define VFMACCVF_FLOAT vfmacc_vf_f32m2
#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2
#else
#define VSETVL(n) vsetvl_e64m2(n)
#define VSETVL_MAX vsetvlmax_e64m2()
#define FLOAT_V_T vfloat64m2_t
#define VLEV_FLOAT vle64_v_f64m2
#define VLSEV_FLOAT vlse64_v_f64m2
#define VLSEG2_FLOAT vlseg2e64_v_f64m2
#define VSEV_FLOAT vse64_v_f64m2
#define VSSEV_FLOAT vsse64_v_f64m2
#define VSSEG2_FLOAT vsseg2e64_v_f64m2
#define VFMACCVF_FLOAT vfmacc_vf_f64m2
#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2
#endif
static FLOAT dm1 = -1.;
#ifdef CONJ
#define GEMM_KERNEL GEMM_KERNEL_R
#else
#define GEMM_KERNEL GEMM_KERNEL_N
#endif
#if GEMM_DEFAULT_UNROLL_N == 1
#define GEMM_UNROLL_N_SHIFT 0
#endif
#if GEMM_DEFAULT_UNROLL_N == 2
#define GEMM_UNROLL_N_SHIFT 1
#endif
#if GEMM_DEFAULT_UNROLL_N == 4
#define GEMM_UNROLL_N_SHIFT 2
#endif
#if GEMM_DEFAULT_UNROLL_N == 8
#define GEMM_UNROLL_N_SHIFT 3
#endif
#if GEMM_DEFAULT_UNROLL_N == 16
#define GEMM_UNROLL_N_SHIFT 4
#endif
// Optimizes the implementation in ../arm64/trsm_kernel_RN_sve.c
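// solve(): forward substitution along the columns of the packed B block.  The
// update for the remaining columns k > i runs across C with a stride of ldc
// elements, hence the strided loads/stores (VLSEV/VSSEV with stride_ldc, which is
// given in bytes).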
#ifndef COMPLEX
#if GEMM_DEFAULT_UNROLL_N == 1
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
FLOAT aa, bb;
FLOAT *pb, *pc;
BLASLONG stride_ldc = sizeof(FLOAT) * ldc;
int i, j, k;
size_t vl;
FLOAT_V_T vb, vc;
for (i = 0; i < n; i++)
{
bb = *(b + i);
for (j = 0; j < m; j ++)
{
aa = *(c + j + i * ldc);
aa *= bb;
*a = aa;
*(c + j + i * ldc) = aa;
a ++;
pb = b + i + 1;
pc = c + j + (i + 1) *ldc;
for (k = (n - i - 1); k > 0; k -= vl)
{
vl = VSETVL(k);
vc = VLSEV_FLOAT(pc, stride_ldc, vl);
vb = VLEV_FLOAT(pb, vl);
vc = VFNMSACVF_FLOAT(vc, aa, vb, vl);
VSSEV_FLOAT(pc, stride_ldc, vc, vl);
pb += vl;
pc ++;
}
}
b += n;
}
}
#elif GEMM_DEFAULT_UNROLL_N == 2
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
FLOAT aa0, aa1, bb;
FLOAT *pb, *pc;
FLOAT *pa0, *pa1, *pc0, *pc1;
BLASLONG stride_ldc = sizeof(FLOAT) * ldc;
int i, j, k;
size_t vl;
FLOAT_V_T vb, vc0, vc1;
for (i = 0; i < n; i++)
{
bb = *(b + i);
pc = c + i * ldc;
for (j = 0; j < m/2; j ++)
{
pa0 = pc + j * 2;
pa1 = pc + j * 2 + 1;
aa0 = *pa0 * bb;
aa1 = *pa1 * bb;
*pa0 = aa0;
*pa1 = aa1;
*a = aa0;
*(a + 1)= aa1;
a += 2;
pb = b + i + 1;
pc0 = pa0 + ldc;
pc1 = pa1 + ldc;
for (k = (n - i - 1); k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
vb = VLEV_FLOAT(pb, vl);
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
pb += vl;
pc0++;
pc1++;
}
}
pc += (m/2)*2;
if (m & 1)
{
pa0 = pc;
aa0 = *pa0 * bb;
*pa0 = aa0;
*a = aa0;
a += 1;
pb = b + i + 1;
pc0 = pa0 + ldc;
for (k = (n - i - 1); k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
vb = VLEV_FLOAT(pb, vl);
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
pb += vl;
pc0++;
}
}
b += n;
}
}
#elif GEMM_DEFAULT_UNROLL_N == 4
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
FLOAT bb;
FLOAT aa0, aa1, aa2, aa3;
FLOAT *pb, *pc;
FLOAT *pa0, *pa1, *pa2, *pa3;
FLOAT *pc0, *pc1, *pc2, *pc3;
BLASLONG stride_ldc = sizeof(FLOAT) * ldc;
int i, j, k;
size_t vl;
FLOAT_V_T vb, vc0, vc1, vc2, vc3;
for (i = 0; i < n; i++)
{
bb = *(b + i);
pc = c + i * ldc;
for (j = 0; j < m/4; j ++)
{
pa0 = pc + j * 4;
pa1 = pa0 + 1;
pa2 = pa1 + 1;
pa3 = pa2 + 1;
aa0 = *pa0 * bb;
aa1 = *pa1 * bb;
aa2 = *pa2 * bb;
aa3 = *pa3 * bb;
*pa0 = aa0;
*pa1 = aa1;
*pa2 = aa2;
*pa3 = aa3;
*a = aa0;
*(a + 1)= aa1;
*(a + 2)= aa2;
*(a + 3)= aa3;
a += 4;
pb = b + i + 1;
pc0 = pa0 + ldc;
pc1 = pa1 + ldc;
pc2 = pa2 + ldc;
pc3 = pa3 + ldc;
for (k = (n - i - 1); k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl);
vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl);
vb = VLEV_FLOAT(pb, vl);
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl);
vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl);
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
VSSEV_FLOAT(pc2, stride_ldc, vc2, vl);
VSSEV_FLOAT(pc3, stride_ldc, vc3, vl);
pb += vl;
pc0++;
pc1++;
pc2++;
pc3++;
}
}
pc += (m/4)*4;
if (m & 2)
{
pa0 = pc;
pa1 = pa0 + 1;
aa0 = *pa0 * bb;
aa1 = *pa1 * bb;
*pa0 = aa0;
*pa1 = aa1;
*a = aa0;
*(a + 1)= aa1;
a += 2;
pb = b + i + 1;
pc0 = pa0 + ldc;
pc1 = pa1 + ldc;
for (k = (n - i - 1); k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
vb = VLEV_FLOAT(pb, vl);
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
pb += vl;
pc0++;
pc1++;
}
pc += 2;
}
if (m & 1)
{
pa0 = pc;
aa0 = *pa0 * bb;
*pa0 = aa0;
*a = aa0;
a += 1;
pb = b + i + 1;
pc0 = pa0 + ldc;
for (k = (n - i - 1); k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
vb = VLEV_FLOAT(pb, vl);
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
pb += vl;
pc0++;
}
}
b += n;
}
}
#elif GEMM_DEFAULT_UNROLL_N == 8
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
FLOAT bb;
FLOAT aa0, aa1, aa2, aa3, aa4, aa5, aa6, aa7;
FLOAT *pb, *pc;
FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;
FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7;
BLASLONG stride_ldc = sizeof(FLOAT) * ldc;
int i, j, k;
size_t vl;
FLOAT_V_T vb, vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7;
for (i = 0; i < n; i++)
{
bb = *(b + i);
pc = c + i * ldc;
for (j = 0; j < m/8; j ++)
{
pa0 = pc + j * 8;
pa1 = pa0 + 1;
pa2 = pa1 + 1;
pa3 = pa2 + 1;
pa4 = pa3 + 1;
pa5 = pa4 + 1;
pa6 = pa5 + 1;
pa7 = pa6 + 1;
aa0 = *pa0 * bb;
aa1 = *pa1 * bb;
aa2 = *pa2 * bb;
aa3 = *pa3 * bb;
aa4 = *pa4 * bb;
aa5 = *pa5 * bb;
aa6 = *pa6 * bb;
aa7 = *pa7 * bb;
*pa0 = aa0;
*pa1 = aa1;
*pa2 = aa2;
*pa3 = aa3;
*pa4 = aa4;
*pa5 = aa5;
*pa6 = aa6;
*pa7 = aa7;
*a = aa0;
*(a + 1)= aa1;
*(a + 2)= aa2;
*(a + 3)= aa3;
*(a + 4)= aa4;
*(a + 5)= aa5;
*(a + 6)= aa6;
*(a + 7)= aa7;
a += 8;
pb = b + i + 1;
pc0 = pa0 + ldc;
pc1 = pa1 + ldc;
pc2 = pa2 + ldc;
pc3 = pa3 + ldc;
pc4 = pa4 + ldc;
pc5 = pa5 + ldc;
pc6 = pa6 + ldc;
pc7 = pa7 + ldc;
for (k = (n - i - 1); k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl);
vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl);
vc4 = VLSEV_FLOAT(pc4, stride_ldc, vl);
vc5 = VLSEV_FLOAT(pc5, stride_ldc, vl);
vc6 = VLSEV_FLOAT(pc6, stride_ldc, vl);
vc7 = VLSEV_FLOAT(pc7, stride_ldc, vl);
vb = VLEV_FLOAT(pb, vl);
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl);
vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl);
vc4 = VFNMSACVF_FLOAT(vc4, aa4, vb, vl);
vc5 = VFNMSACVF_FLOAT(vc5, aa5, vb, vl);
vc6 = VFNMSACVF_FLOAT(vc6, aa6, vb, vl);
vc7 = VFNMSACVF_FLOAT(vc7, aa7, vb, vl);
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
VSSEV_FLOAT(pc2, stride_ldc, vc2, vl);
VSSEV_FLOAT(pc3, stride_ldc, vc3, vl);
VSSEV_FLOAT(pc4, stride_ldc, vc4, vl);
VSSEV_FLOAT(pc5, stride_ldc, vc5, vl);
VSSEV_FLOAT(pc6, stride_ldc, vc6, vl);
VSSEV_FLOAT(pc7, stride_ldc, vc7, vl);
pb += vl;
pc0++;
pc1++;
pc2++;
pc3++;
pc4++;
pc5++;
pc6++;
pc7++;
}
}
pc += (m/8)*8;
if (m & 4)
{
pa0 = pc;
pa1 = pa0 + 1;
pa2 = pa1 + 1;
pa3 = pa2 + 1;
aa0 = *pa0 * bb;
aa1 = *pa1 * bb;
aa2 = *pa2 * bb;
aa3 = *pa3 * bb;
*pa0 = aa0;
*pa1 = aa1;
*pa2 = aa2;
*pa3 = aa3;
*a = aa0;
*(a + 1)= aa1;
*(a + 2)= aa2;
*(a + 3)= aa3;
a += 4;
pb = b + i + 1;
pc0 = pa0 + ldc;
pc1 = pa1 + ldc;
pc2 = pa2 + ldc;
pc3 = pa3 + ldc;
for (k = (n - i - 1); k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl);
vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl);
vb = VLEV_FLOAT(pb, vl);
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl);
vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl);
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
VSSEV_FLOAT(pc2, stride_ldc, vc2, vl);
VSSEV_FLOAT(pc3, stride_ldc, vc3, vl);
pb += vl;
pc0++;
pc1++;
pc2++;
pc3++;
}
pc += 4;
}
if (m & 2)
{
pa0 = pc;
pa1 = pa0 + 1;
aa0 = *pa0 * bb;
aa1 = *pa1 * bb;
*pa0 = aa0;
*pa1 = aa1;
*a = aa0;
*(a + 1)= aa1;
a += 2;
pb = b + i + 1;
pc0 = pa0 + ldc;
pc1 = pa1 + ldc;
for (k = (n - i - 1); k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
vb = VLEV_FLOAT(pb, vl);
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
pb += vl;
pc0++;
pc1++;
}
pc += 2;
}
if (m & 1)
{
pa0 = pc;
aa0 = *pa0 * bb;
*pa0 = aa0;
*a = aa0;
a += 1;
pb = b + i + 1;
pc0 = pa0 + ldc;
for (k = (n - i - 1); k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
vb = VLEV_FLOAT(pb, vl);
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
pb += vl;
pc0++;
}
}
b += n;
}
}
#else
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
FLOAT aa, bb;
int i, j, k;
for (i = 0; i < n; i++) {
bb = *(b + i);
for (j = 0; j < m; j ++) {
aa = *(c + j + i * ldc);
aa *= bb;
*a = aa;
*(c + j + i * ldc) = aa;
a ++;
for (k = i + 1; k < n; k ++){
*(c + j + k * ldc) -= aa * *(b + k);
}
}
b += n;
}
}
#endif
#else
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
FLOAT aa1, aa2;
FLOAT bb1, bb2;
FLOAT cc1, cc2;
int i, j, k;
ldc *= 2;
for (i = 0; i < n; i++) {
bb1 = *(b + i * 2 + 0);
bb2 = *(b + i * 2 + 1);
for (j = 0; j < m; j ++) {
aa1 = *(c + j * 2 + 0 + i * ldc);
aa2 = *(c + j * 2 + 1 + i * ldc);
#ifndef CONJ
cc1 = aa1 * bb1 - aa2 * bb2;
cc2 = aa1 * bb2 + aa2 * bb1;
#else
cc1 = aa1 * bb1 + aa2 * bb2;
cc2 = -aa1 * bb2 + aa2 * bb1;
#endif
*(a + 0) = cc1;
*(a + 1) = cc2;
*(c + j * 2 + 0 + i * ldc) = cc1;
*(c + j * 2 + 1 + i * ldc) = cc2;
a += 2;
for (k = i + 1; k < n; k ++){
#ifndef CONJ
*(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1);
*(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0);
#else
*(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1);
*(c + j * 2 + 1 + k * ldc) -= - cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0);
#endif
}
}
b += n * 2;
}
}
#endif
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
#ifdef COMPLEX
FLOAT dummy2,
#endif
FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){
FLOAT *aa, *cc;
BLASLONG kk;
BLASLONG i, j;
size_t vl = VSETVL_MAX;
//fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug
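// Forward variant: kk starts at -offset and grows by one block per column block. For each
// block, GEMM_KERNEL first subtracts the contribution of the kk previously processed panels,
// then solve() handles the current diagonal block; m is walked in chunks of vl (VSETVL_MAX)
// with an m % vl remainder pass.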
j = (n >> GEMM_UNROLL_N_SHIFT);
kk = -offset;
while (j > 0) {
aa = a;
cc = c;
i = vl;
if (i <= m) {
do {
if (kk > 0) {
GEMM_KERNEL(vl, GEMM_UNROLL_N, kk, dm1,
#ifdef COMPLEX
ZERO,
#endif
aa, b, cc, ldc);
}
solve(vl, GEMM_UNROLL_N,
aa + kk * vl * COMPSIZE,
b + kk * GEMM_UNROLL_N * COMPSIZE,
cc, ldc);
aa += vl * k * COMPSIZE;
cc += vl * COMPSIZE;
i += vl;
} while (i <= m);
}
i = m % vl;
if (i) {
if (kk > 0) {
GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1,
#ifdef COMPLEX
ZERO,
#endif
aa, b, cc, ldc);
}
solve(i, GEMM_UNROLL_N,
aa + kk * i * COMPSIZE,
b + kk * GEMM_UNROLL_N * COMPSIZE,
cc, ldc);
aa += i * k * COMPSIZE;
cc += i * COMPSIZE;
}
kk += GEMM_UNROLL_N;
b += GEMM_UNROLL_N * k * COMPSIZE;
c += GEMM_UNROLL_N * ldc * COMPSIZE;
j --;
}
if (n & (GEMM_UNROLL_N - 1)) {
j = (GEMM_UNROLL_N >> 1);
while (j > 0) {
if (n & j) {
aa = a;
cc = c;
i = vl;
while (i <= m) {
if (kk > 0) {
GEMM_KERNEL(vl, j, kk, dm1,
#ifdef COMPLEX
ZERO,
#endif
aa,
b,
cc,
ldc);
}
solve(vl, j,
aa + kk * vl * COMPSIZE,
b + kk * j * COMPSIZE, cc, ldc);
aa += vl * k * COMPSIZE;
cc += vl * COMPSIZE;
i += vl;
}
i = m % vl;
if (i) {
if (kk > 0) {
GEMM_KERNEL(i, j, kk, dm1,
#ifdef COMPLEX
ZERO,
#endif
aa,
b,
cc,
ldc);
}
solve(i, j,
aa + kk * i * COMPSIZE,
b + kk * j * COMPSIZE, cc, ldc);
aa += i * k * COMPSIZE;
cc += i * COMPSIZE;
}
b += j * k * COMPSIZE;
c += j * ldc * COMPSIZE;
kk += j;
}
j >>= 1;
}
}
return 0;
}

View File

@ -0,0 +1,828 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m2(n)
#define VSETVL_MAX vsetvlmax_e32m2()
#define FLOAT_V_T vfloat32m2_t
#define VLEV_FLOAT vle32_v_f32m2
#define VLSEV_FLOAT vlse32_v_f32m2
#define VLSEG2_FLOAT vlseg2e32_v_f32m2
#define VSEV_FLOAT vse32_v_f32m2
#define VSSEV_FLOAT vsse32_v_f32m2
#define VSSEG2_FLOAT vsseg2e32_v_f32m2
#define VFMACCVF_FLOAT vfmacc_vf_f32m2
#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2
#else
#define VSETVL(n) vsetvl_e64m2(n)
#define VSETVL_MAX vsetvlmax_e64m2()
#define FLOAT_V_T vfloat64m2_t
#define VLEV_FLOAT vle64_v_f64m2
#define VLSEV_FLOAT vlse64_v_f64m2
#define VLSEG2_FLOAT vlseg2e64_v_f64m2
#define VSEV_FLOAT vse64_v_f64m2
#define VSSEV_FLOAT vsse64_v_f64m2
#define VSSEG2_FLOAT vsseg2e64_v_f64m2
#define VFMACCVF_FLOAT vfmacc_vf_f64m2
#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2
#endif
static FLOAT dm1 = -1.;
#ifdef CONJ
#define GEMM_KERNEL GEMM_KERNEL_R
#else
#define GEMM_KERNEL GEMM_KERNEL_N
#endif
#if GEMM_DEFAULT_UNROLL_N == 1
#define GEMM_UNROLL_N_SHIFT 0
#endif
#if GEMM_DEFAULT_UNROLL_N == 2
#define GEMM_UNROLL_N_SHIFT 1
#endif
#if GEMM_DEFAULT_UNROLL_N == 4
#define GEMM_UNROLL_N_SHIFT 2
#endif
#if GEMM_DEFAULT_UNROLL_N == 8
#define GEMM_UNROLL_N_SHIFT 3
#endif
#if GEMM_DEFAULT_UNROLL_N == 16
#define GEMM_UNROLL_N_SHIFT 4
#endif
// Optimizes the implementation in ../arm64/trsm_kernel_RT_sve.c
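// solve() performs backward substitution over the columns of C (i runs from n - 1 down to 0).
// One specialization per GEMM_DEFAULT_UNROLL_N unrolls the row loop over m by 1/2/4/8 so that
// many rows of C are updated per strided vector load.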
#ifndef COMPLEX
#if GEMM_DEFAULT_UNROLL_N == 1
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
FLOAT aa, bb;
FLOAT *pb, *pc;
BLASLONG stride_ldc = sizeof(FLOAT) * ldc;
int i, j, k;
size_t vl;
FLOAT_V_T vb, vc;
a += (n - 1) * m;
b += (n - 1) * n;
for (i = n - 1; i >= 0; i--) {
bb = *(b + i);
for (j = 0; j < m; j ++) {
aa = *(c + j + i * ldc);
aa *= bb;
*a = aa;
*(c + j + i * ldc) = aa;
a ++;
pb = b;
pc = c + j;
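// Update the not-yet-solved columns of this row: C[j, 0..i-1] -= aa * b[0..i-1], vectorized
// over k with an ldc-element stride on C and a unit-stride load of the packed b.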
for (k = i; k > 0; k -= vl)
{
vl = VSETVL(k);
vc = VLSEV_FLOAT(pc, stride_ldc, vl);
vb = VLEV_FLOAT(pb, vl);
vc = VFNMSACVF_FLOAT(vc, aa, vb, vl);
VSSEV_FLOAT(pc, stride_ldc, vc, vl);
pb += vl;
pc++;
}
}
b -= n;
a -= 2 * m;
}
}
#elif GEMM_DEFAULT_UNROLL_N == 2
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
FLOAT aa0, aa1, bb;
FLOAT *pb, *pc;
FLOAT *pa0, *pa1, *pc0, *pc1;
BLASLONG stride_ldc = sizeof(FLOAT) * ldc;
int i, j, k;
size_t vl;
FLOAT_V_T vb, vc0, vc1;
a += (n - 1) * m;
b += (n - 1) * n;
for (i = n - 1; i >= 0; i--)
{
bb = *(b + i);
pc = c + i * ldc;
for (j = 0; j < m/2; j ++)
{
pa0 = pc + j * 2;
pa1 = pc + j * 2 + 1;
aa0 = *pa0 * bb;
aa1 = *pa1 * bb;
*pa0 = aa0;
*pa1 = aa1;
*a = aa0;
*(a + 1)= aa1;
a += 2;
pb = b;
pc0 = c + j * 2;
pc1 = pc0 + 1;
for (k = i; k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
vb = VLEV_FLOAT(pb, vl);
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
pb += vl;
pc0++;
pc1++;
}
}
pc += (m/2)*2;
if (m & 1)
{
pa0 = pc;
aa0 = *pa0 * bb;
*pa0 = aa0;
*a = aa0;
a += 1;
pb = b;
pc0 = pc - i * ldc;
for (k = i; k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
vb = VLEV_FLOAT(pb, vl);
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
pb += vl;
pc0++;
}
}
b -= n;
a -= 2 * m;
}
}
#elif GEMM_DEFAULT_UNROLL_N == 4
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
FLOAT aa0, aa1, aa2, aa3;
FLOAT bb;
FLOAT *pb, *pc;
FLOAT *pa0, *pa1, *pa2, *pa3;
FLOAT *pc0, *pc1, *pc2, *pc3;
BLASLONG stride_ldc = sizeof(FLOAT) * ldc;
int i, j, k;
size_t vl;
FLOAT_V_T vb, vc0, vc1, vc2, vc3;
a += (n - 1) * m;
b += (n - 1) * n;
for (i = n - 1; i >= 0; i--)
{
bb = *(b + i);
pc = c + i * ldc;
for (j = 0; j < m/4; j ++)
{
pa0 = pc + j * 4;
pa1 = pa0 + 1;
pa2 = pa1 + 1;
pa3 = pa2 + 1;
aa0 = *pa0 * bb;
aa1 = *pa1 * bb;
aa2 = *pa2 * bb;
aa3 = *pa3 * bb;
*pa0 = aa0;
*pa1 = aa1;
*pa2 = aa2;
*pa3 = aa3;
*a = aa0;
*(a + 1)= aa1;
*(a + 2)= aa2;
*(a + 3)= aa3;
a += 4;
pb = b;
pc0 = c + j * 4;
pc1 = pc0 + 1;
pc2 = pc1 + 1;
pc3 = pc2 + 1;
for (k = i; k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl);
vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl);
vb = VLEV_FLOAT(pb, vl);
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl);
vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl);
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
VSSEV_FLOAT(pc2, stride_ldc, vc2, vl);
VSSEV_FLOAT(pc3, stride_ldc, vc3, vl);
pb += vl;
pc0++;
pc1++;
pc2++;
pc3++;
}
}
pc += (m/4)*4;
if (m & 2)
{
pa0 = pc;
pa1 = pa0 + 1;
aa0 = *pa0 * bb;
aa1 = *pa1 * bb;
*pa0 = aa0;
*pa1 = aa1;
*a = aa0;
*(a + 1)= aa1;
a += 2;
pb = b;
pc0 = c + j * 4;
pc1 = pc0 + 1;
for (k = i; k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
vb = VLEV_FLOAT(pb, vl);
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
pb += vl;
pc0++;
pc1++;
}
pc += 2;
}
if (m & 1)
{
pa0 = pc;
aa0 = *pa0 * bb;
*pa0 = aa0;
*a = aa0;
a += 1;
pb = b;
pc0 = pc - i * ldc;
for (k = i; k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
vb = VLEV_FLOAT(pb, vl);
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
pb += vl;
pc0++;
}
}
b -= n;
a -= 2 * m;
}
}
#elif GEMM_DEFAULT_UNROLL_N == 8
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
FLOAT aa0, aa1, aa2, aa3, aa4, aa5, aa6, aa7;
FLOAT bb;
FLOAT *pb, *pc;
FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;
FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7;
BLASLONG stride_ldc = sizeof(FLOAT) * ldc;
int i, j, k;
size_t vl;
FLOAT_V_T vb, vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7;
a += (n - 1) * m;
b += (n - 1) * n;
for (i = n - 1; i >= 0; i--)
{
bb = *(b + i);
pc = c + i * ldc;
for (j = 0; j < m/8; j ++)
{
pa0 = pc + j * 8;
pa1 = pa0 + 1;
pa2 = pa1 + 1;
pa3 = pa2 + 1;
pa4 = pa3 + 1;
pa5 = pa4 + 1;
pa6 = pa5 + 1;
pa7 = pa6 + 1;
aa0 = *pa0 * bb;
aa1 = *pa1 * bb;
aa2 = *pa2 * bb;
aa3 = *pa3 * bb;
aa4 = *pa4 * bb;
aa5 = *pa5 * bb;
aa6 = *pa6 * bb;
aa7 = *pa7 * bb;
*pa0 = aa0;
*pa1 = aa1;
*pa2 = aa2;
*pa3 = aa3;
*pa4 = aa4;
*pa5 = aa5;
*pa6 = aa6;
*pa7 = aa7;
*a = aa0;
*(a + 1)= aa1;
*(a + 2)= aa2;
*(a + 3)= aa3;
*(a + 4)= aa4;
*(a + 5)= aa5;
*(a + 6)= aa6;
*(a + 7)= aa7;
a += 8;
pb = b;
pc0 = c + j * 8;
pc1 = pc0 + 1;
pc2 = pc1 + 1;
pc3 = pc2 + 1;
pc4 = pc3 + 1;
pc5 = pc4 + 1;
pc6 = pc5 + 1;
pc7 = pc6 + 1;
for (k = i; k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl);
vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl);
vc4 = VLSEV_FLOAT(pc4, stride_ldc, vl);
vc5 = VLSEV_FLOAT(pc5, stride_ldc, vl);
vc6 = VLSEV_FLOAT(pc6, stride_ldc, vl);
vc7 = VLSEV_FLOAT(pc7, stride_ldc, vl);
vb = VLEV_FLOAT(pb, vl);
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl);
vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl);
vc4 = VFNMSACVF_FLOAT(vc4, aa4, vb, vl);
vc5 = VFNMSACVF_FLOAT(vc5, aa5, vb, vl);
vc6 = VFNMSACVF_FLOAT(vc6, aa6, vb, vl);
vc7 = VFNMSACVF_FLOAT(vc7, aa7, vb, vl);
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
VSSEV_FLOAT(pc2, stride_ldc, vc2, vl);
VSSEV_FLOAT(pc3, stride_ldc, vc3, vl);
VSSEV_FLOAT(pc4, stride_ldc, vc4, vl);
VSSEV_FLOAT(pc5, stride_ldc, vc5, vl);
VSSEV_FLOAT(pc6, stride_ldc, vc6, vl);
VSSEV_FLOAT(pc7, stride_ldc, vc7, vl);
pb += vl;
pc0++;
pc1++;
pc2++;
pc3++;
pc4++;
pc5++;
pc6++;
pc7++;
}
}
pc += (m/8)*8;
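// Leftover rows (m mod 8) fall through to the 4-, 2- and 1-row tail paths below.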
if (m & 4)
{
pa0 = pc;
pa1 = pa0 + 1;
pa2 = pa1 + 1;
pa3 = pa2 + 1;
aa0 = *pa0 * bb;
aa1 = *pa1 * bb;
aa2 = *pa2 * bb;
aa3 = *pa3 * bb;
*pa0 = aa0;
*pa1 = aa1;
*pa2 = aa2;
*pa3 = aa3;
*a = aa0;
*(a + 1)= aa1;
*(a + 2)= aa2;
*(a + 3)= aa3;
a += 4;
pb = b;
pc0 = pc - i * ldc;
pc1 = pc0 + 1;
pc2 = pc1 + 1;
pc3 = pc2 + 1;
for (k = i; k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl);
vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl);
vb = VLEV_FLOAT(pb, vl);
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl);
vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl);
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
VSSEV_FLOAT(pc2, stride_ldc, vc2, vl);
VSSEV_FLOAT(pc3, stride_ldc, vc3, vl);
pb += vl;
pc0++;
pc1++;
pc2++;
pc3++;
}
pc += 4;
}
if (m & 2)
{
pa0 = pc;
pa1 = pa0 + 1;
aa0 = *pa0 * bb;
aa1 = *pa1 * bb;
*pa0 = aa0;
*pa1 = aa1;
*a = aa0;
*(a + 1)= aa1;
a += 2;
pb = b;
pc0 = pc - i * ldc;
pc1 = pc0 + 1;
for (k = i; k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
vb = VLEV_FLOAT(pb, vl);
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
pb += vl;
pc0++;
pc1++;
}
pc += 2;
}
if (m & 1)
{
pa0 = pc;
aa0 = *pa0 * bb;
*pa0 = aa0;
*a = aa0;
a += 1;
pb = b;
pc0 = pc - i * ldc;
for (k = i; k > 0; k -= vl)
{
vl = VSETVL(k);
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
vb = VLEV_FLOAT(pb, vl);
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
pb += vl;
pc0++;
}
}
b -= n;
a -= 2 * m;
}
}
#else
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
FLOAT aa, bb;
int i, j, k;
a += (n - 1) * m;
b += (n - 1) * n;
for (i = n - 1; i >= 0; i--) {
bb = *(b + i);
for (j = 0; j < m; j ++) {
aa = *(c + j + i * ldc);
aa *= bb;
*a = aa;
*(c + j + i * ldc) = aa;
a ++;
for (k = 0; k < i; k ++){
*(c + j + k * ldc) -= aa * *(b + k);
}
}
b -= n;
a -= 2 * m;
}
}
#endif
#else
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
FLOAT aa1, aa2;
FLOAT bb1, bb2;
FLOAT cc1, cc2;
int i, j, k;
ldc *= 2;
a += (n - 1) * m * 2;
b += (n - 1) * n * 2;
for (i = n - 1; i >= 0; i--) {
bb1 = *(b + i * 2 + 0);
bb2 = *(b + i * 2 + 1);
for (j = 0; j < m; j ++) {
aa1 = *(c + j * 2 + 0 + i * ldc);
aa2 = *(c + j * 2 + 1 + i * ldc);
#ifndef CONJ
cc1 = aa1 * bb1 - aa2 * bb2;
cc2 = aa1 * bb2 + aa2 * bb1;
#else
cc1 = aa1 * bb1 + aa2 * bb2;
cc2 = - aa1 * bb2 + aa2 * bb1;
#endif
*(a + 0) = cc1;
*(a + 1) = cc2;
*(c + j * 2 + 0 + i * ldc) = cc1;
*(c + j * 2 + 1 + i * ldc) = cc2;
a += 2;
for (k = 0; k < i; k ++){
#ifndef CONJ
*(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1);
*(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0);
#else
*(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1);
*(c + j * 2 + 1 + k * ldc) -= -cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0);
#endif
}
}
b -= n * 2;
a -= 4 * m;
}
}
#endif
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
#ifdef COMPLEX
FLOAT dummy2,
#endif
FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){
BLASLONG i, j;
FLOAT *aa, *cc;
BLASLONG kk;
size_t vl = VSETVL_MAX;
//fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug
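// Backward variant: kk starts at n - offset and shrinks by one block as c and b are stepped
// back past each column block; the GEMM update uses the last k - kk entries of the packed
// panels before solve() handles the diagonal block.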
kk = n - offset;
c += n * ldc * COMPSIZE;
b += n * k * COMPSIZE;
if (n & (GEMM_UNROLL_N - 1)) {
j = 1;
while (j < GEMM_UNROLL_N) {
if (n & j) {
aa = a;
b -= j * k * COMPSIZE;
c -= j * ldc* COMPSIZE;
cc = c;
i = vl;
if (i <= m) {
do {
if (k - kk > 0) {
GEMM_KERNEL(vl, j, k - kk, dm1,
#ifdef COMPLEX
ZERO,
#endif
aa + vl * kk * COMPSIZE,
b + j * kk * COMPSIZE,
cc,
ldc);
}
solve(vl, j,
aa + (kk - j) * vl * COMPSIZE,
b + (kk - j) * j * COMPSIZE,
cc, ldc);
aa += vl * k * COMPSIZE;
cc += vl * COMPSIZE;
i += vl;
} while (i <= m);
}
i = m % vl;
if (i) {
if (k - kk > 0) {
GEMM_KERNEL(i, j, k - kk, dm1,
#ifdef COMPLEX
ZERO,
#endif
aa + i * kk * COMPSIZE,
b + j * kk * COMPSIZE,
cc, ldc);
}
solve(i, j,
aa + (kk - j) * i * COMPSIZE,
b + (kk - j) * j * COMPSIZE,
cc, ldc);
aa += i * k * COMPSIZE;
cc += i * COMPSIZE;
}
kk -= j;
}
j <<= 1;
}
}
j = (n >> GEMM_UNROLL_N_SHIFT);
if (j > 0) {
do {
aa = a;
b -= GEMM_UNROLL_N * k * COMPSIZE;
c -= GEMM_UNROLL_N * ldc * COMPSIZE;
cc = c;
i = vl;
if (i <= m) {
do {
if (k - kk > 0) {
GEMM_KERNEL(vl, GEMM_UNROLL_N, k - kk, dm1,
#ifdef COMPLEX
ZERO,
#endif
aa + vl * kk * COMPSIZE,
b + GEMM_UNROLL_N * kk * COMPSIZE,
cc,
ldc);
}
solve(vl, GEMM_UNROLL_N,
aa + (kk - GEMM_UNROLL_N) * vl * COMPSIZE,
b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE,
cc, ldc);
aa += vl * k * COMPSIZE;
cc += vl * COMPSIZE;
i += vl;
} while (i <= m);
}
i = m % vl;
if (i) {
if (k - kk > 0) {
GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1,
#ifdef COMPLEX
ZERO,
#endif
aa + i * kk * COMPSIZE,
b + GEMM_UNROLL_N * kk * COMPSIZE,
cc,
ldc);
}
solve(i, GEMM_UNROLL_N,
aa + (kk - GEMM_UNROLL_N) * i * COMPSIZE,
b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE,
cc, ldc);
aa += i * k * COMPSIZE;
cc += i * COMPSIZE;
}
kk -= GEMM_UNROLL_N;
j --;
} while (j > 0);
}
return 0;
}

View File

@ -0,0 +1,122 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m2(n)
#define FLOAT_V_T vfloat32m2_t
#define VLEV_FLOAT vle32_v_f32m2
#define VSEV_FLOAT vse32_v_f32m2
#define VSEV_FLOAT_M vse32_v_f32m2_m
#define VLSEV_FLOAT vlse32_v_f32m2
#define VBOOL_T vbool16_t
#define UINT_V_T vuint32m2_t
#define VID_V_UINT vid_v_u32m2
#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16
#else
#define VSETVL(n) vsetvl_e64m2(n)
#define FLOAT_V_T vfloat64m2_t
#define VLEV_FLOAT vle64_v_f64m2
#define VSEV_FLOAT vse64_v_f64m2
#define VSEV_FLOAT_M vse64_v_f64m2_m
#define VLSEV_FLOAT vlse64_v_f64m2
#define VBOOL_T vbool32_t
#define UINT_V_T vuint64m2_t
#define VID_V_UINT vid_v_u64m2
#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32
#endif
#ifndef UNIT
#define INV(a) (ONE / (a))
#else
#define INV(a) (ONE)
#endif
// Optimizes the implementation in ../arm64/trsm_lncopy_sve.c
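// Packs a lower-triangular panel of A. In the diagonal block each row is gathered with a
// strided (lda) load, only the lanes left of the diagonal (index < j) are stored through the
// mask, and the diagonal entry itself is written as INV() so the TRSM kernel multiplies
// instead of divides.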
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
BLASLONG i, ii, jj, js;
FLOAT *ao;
jj = offset;
BLASLONG stride_lda = sizeof(FLOAT)*lda;
FLOAT_V_T va1;
VBOOL_T vbool_cmp;
UINT_V_T vindex;
size_t vl;
for (js = n; js > 0; js -= vl)
{
vl = VSETVL(js);
ao = a;
ii = 0;
for (i = 0; i < m;)
{
if (ii == jj)
{
vindex = VID_V_UINT(vl);
for (unsigned int j = 0; j < vl; j++)
{
va1 = VLSEV_FLOAT(ao, stride_lda, vl);
vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl);
VSEV_FLOAT_M(vbool_cmp, b, va1, vl);
*(b + j) = INV(*(ao + j * lda));
ao++;
b += vl;
}
i += vl;
ii += vl;
}
else
{
if (ii > jj)
{
va1 = VLSEV_FLOAT(ao, stride_lda, vl);
VSEV_FLOAT(b, va1, vl);
}
ao++;
b += vl;
i++;
ii++;
}
}
a += vl * lda;
jj += vl;
}
return 0;
}

View File

@ -0,0 +1,122 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m2(n)
#define FLOAT_V_T vfloat32m2_t
#define VLEV_FLOAT vle32_v_f32m2
#define VSEV_FLOAT vse32_v_f32m2
#define VSEV_FLOAT_M vse32_v_f32m2_m
#define VLSEV_FLOAT vlse32_v_f32m2
#define VBOOL_T vbool16_t
#define UINT_V_T vuint32m2_t
#define VID_V_UINT vid_v_u32m2
#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16
#else
#define VSETVL(n) vsetvl_e64m2(n)
#define FLOAT_V_T vfloat64m2_t
#define VLEV_FLOAT vle64_v_f64m2
#define VSEV_FLOAT vse64_v_f64m2
#define VSEV_FLOAT_M vse64_v_f64m2_m
#define VLSEV_FLOAT vlse64_v_f64m2
#define VBOOL_T vbool32_t
#define UINT_V_T vuint64m2_t
#define VID_V_UINT vid_v_u64m2
#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32
#endif
#ifndef UNIT
#define INV(a) (ONE / (a))
#else
#define INV(a) (ONE)
#endif
// Optimizes the implementation in ../arm64/trsm_ltcopy_sve.c
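// Transposed lower-triangular pack. Each column of the diagonal block is loaded contiguously;
// the sub-diagonal lanes (index > j) are stored through the mask and lane j receives INV()
// of the diagonal entry.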
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
BLASLONG i, ii, jj, js;
FLOAT *ao;
jj = offset;
FLOAT_V_T va1;
VBOOL_T vbool_cmp;
UINT_V_T vindex;
size_t vl;
for (js = n; js > 0; js -= vl)
{
vl = VSETVL(js);
ao = a;
ii = 0;
for (i = 0; i < m;)
{
if (ii == jj)
{
vindex = VID_V_UINT(vl);
for (unsigned int j = 0; j < vl; j++)
{
*(b + j) = INV(*(ao + j));
va1 = VLEV_FLOAT(ao, vl);
vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl);
VSEV_FLOAT_M(vbool_cmp, b, va1, vl);
b += vl;
ao += lda;
}
i += vl;
ii += vl;
}
else
{
if (ii < jj)
{
va1 = VLEV_FLOAT(ao, vl);
VSEV_FLOAT(b, va1, vl);
}
ao += lda;
b += vl;
i ++;
ii ++;
}
}
a += vl;
jj += vl;
}
return 0;
}

View File

@ -0,0 +1,121 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m2(n)
#define FLOAT_V_T vfloat32m2_t
#define VLEV_FLOAT vle32_v_f32m2
#define VSEV_FLOAT vse32_v_f32m2
#define VSEV_FLOAT_M vse32_v_f32m2_m
#define VLSEV_FLOAT vlse32_v_f32m2
#define VBOOL_T vbool16_t
#define UINT_V_T vuint32m2_t
#define VID_V_UINT vid_v_u32m2
#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16
#else
#define VSETVL(n) vsetvl_e64m2(n)
#define FLOAT_V_T vfloat64m2_t
#define VLEV_FLOAT vle64_v_f64m2
#define VSEV_FLOAT vse64_v_f64m2
#define VSEV_FLOAT_M vse64_v_f64m2_m
#define VLSEV_FLOAT vlse64_v_f64m2
#define VBOOL_T vbool32_t
#define UINT_V_T vuint64m2_t
#define VID_V_UINT vid_v_u64m2
#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32
#endif
#ifndef UNIT
#define INV(a) (ONE / (a))
#else
#define INV(a) (ONE)
#endif
// Optimizes the implementation in ../arm64/trsm_uncopy_sve.c
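// Upper-triangular pack. Diagonal-block rows are gathered with a strided (lda) load; lanes
// right of the diagonal (index > j) are stored through the mask and lane j gets INV() of the
// diagonal entry. Off-diagonal blocks are copied only when they lie above the diagonal (ii < jj).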
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
BLASLONG i, ii, jj, js;
BLASLONG stride_lda = sizeof(FLOAT)*lda;
FLOAT *ao;
jj = offset;
FLOAT_V_T va1;
VBOOL_T vbool_cmp;
UINT_V_T vindex;
size_t vl;
for (js = n; js > 0; js -= vl)
{
vl = VSETVL(js);
ao = a;
i = 0;
ii = 0;
for (i = 0; i < m;)
{
if (ii == jj)
{
vindex = VID_V_UINT(vl);
for (unsigned int j = 0; j < vl; j++)
{
*(b + j) = INV(*(ao + j * lda));
va1 = VLSEV_FLOAT(ao, stride_lda, vl);
vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl);
VSEV_FLOAT_M(vbool_cmp, b, va1, vl);
ao++;
b += vl;
}
i += vl;
ii += vl;
}
else
{
if (ii < jj)
{
va1 = VLSEV_FLOAT(ao, stride_lda, vl);
VSEV_FLOAT(b, va1, vl);
}
ao++;
b += vl;
i++;
ii++;
}
}
a += vl * lda;
jj += vl;
}
return 0;
}

View File

@ -0,0 +1,123 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m2(n)
#define FLOAT_V_T vfloat32m2_t
#define VLEV_FLOAT vle32_v_f32m2
#define VSEV_FLOAT vse32_v_f32m2
#define VSEV_FLOAT_M vse32_v_f32m2_m
#define VLSEV_FLOAT vlse32_v_f32m2
#define VBOOL_T vbool16_t
#define UINT_V_T vuint32m2_t
#define VID_V_UINT vid_v_u32m2
#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16
#else
#define VSETVL(n) vsetvl_e64m2(n)
#define FLOAT_V_T vfloat64m2_t
#define VLEV_FLOAT vle64_v_f64m2
#define VSEV_FLOAT vse64_v_f64m2
#define VSEV_FLOAT_M vse64_v_f64m2_m
#define VLSEV_FLOAT vlse64_v_f64m2
#define VBOOL_T vbool32_t
#define UINT_V_T vuint64m2_t
#define VID_V_UINT vid_v_u64m2
#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32
#endif
#ifndef UNIT
#define INV(a) (ONE / (a))
#else
#define INV(a) (ONE)
#endif
// Optimizes the implementation in ../arm64/trsm_utcopy_sve.c
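// Transposed upper-triangular pack: contiguous column loads, a masked store of the lanes with
// index < j, and INV() of the diagonal entry in lane j; off-diagonal blocks are copied when
// ii > jj.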
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
BLASLONG i, ii, jj, js;
FLOAT *ao;
jj = offset;
FLOAT_V_T va1;
VBOOL_T vbool_cmp;
UINT_V_T vindex;
size_t vl;
for (js = n; js > 0; js -= vl)
{
vl = VSETVL(js);
ao = a;
ii = 0;
for (i = 0; i < m;)
{
if (ii == jj)
{
vindex = VID_V_UINT(vl);
for (unsigned int j = 0; j < vl; j++)
{
va1 = VLEV_FLOAT(ao, vl);
vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl);
VSEV_FLOAT_M(vbool_cmp, b, va1, vl);
*(b + j) = INV(*(ao + j));
ao += lda;
b += vl;
}
i += vl;
ii += vl;
}
else
{
if (ii > jj)
{
va1 = VLEV_FLOAT(ao, vl);
VSEV_FLOAT(b, va1, vl);
}
ao += lda;
b += vl;
i ++;
ii ++;
}
}
a += vl;
jj += vl;
}
return 0;
}

113
kernel/riscv64/zamax_rvv.c Normal file
View File

@ -0,0 +1,113 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <float.h>
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define VSETVL_MAX vsetvlmax_e32m4()
#define VSETVL_MAX_M1 vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m4_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLSEG_FLOAT vlseg2e32_v_f32m4
#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1
#define VFMVVF_FLOAT vfmv_v_f_f32m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFMAXVV_FLOAT vfmax_vv_f32m4
#define VFADDVV_FLOAT vfadd_vv_f32m4
#define VFABSV_FLOAT vfabs_v_f32m4
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define VSETVL_MAX vsetvlmax_e64m4()
#define VSETVL_MAX_M1 vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m4_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLSEG_FLOAT vlseg2e64_v_f64m4
#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1
#define VFMVVF_FLOAT vfmv_v_f_f64m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFMAXVV_FLOAT vfmax_vv_f64m4
#define VFADDVV_FLOAT vfadd_vv_f64m4
#define VFABSV_FLOAT vfabs_v_f64m4
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
#endif
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
FLOAT maxf=0.0;
if (n <= 0 || inc_x <= 0) return(maxf);
FLOAT_V_T v0, v1, vmax;
FLOAT_V_T_M1 v_res;
v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1);
size_t vlmax = VSETVL_MAX;
vmax = VFMVVF_FLOAT(0.0, vlmax);
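// CABS1 convention: |z| is measured as |re| + |im|; running maxima are kept per lane in vmax
// and reduced to a scalar with vfredmax after the loop.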
if(inc_x == 1) {
for (size_t vl; n > 0; n -= vl, x += vl*2) {
vl = VSETVL(n);
VLSEG_FLOAT(&v0, &v1, x, vl);
v0 = VFABSV_FLOAT(v0, vl);
v1 = VFABSV_FLOAT(v1, vl);
v0 = VFADDVV_FLOAT(v0, v1, vl);
vmax = VFMAXVV_FLOAT(vmax, v0, vl);
}
} else {
BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
vl = VSETVL(n);
VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl);
v0 = VFABSV_FLOAT(v0, vl);
v1 = VFABSV_FLOAT(v1, vl);
v0 = VFADDVV_FLOAT(v0, v1, vl);
vmax = VFMAXVV_FLOAT(vmax, v0, vl);
}
}
v_res = VFREDMAXVS_FLOAT(v_res, vmax, v_res, vlmax);
maxf = VFMVFS_FLOAT_M1(v_res);
return(maxf);
}

112
kernel/riscv64/zamin_rvv.c Normal file
View File

@ -0,0 +1,112 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <float.h>
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define VSETVL_MAX vsetvlmax_e32m4()
#define VSETVL_MAX_M1 vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m4_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLSEG_FLOAT vlseg2e32_v_f32m4
#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
#define VFREDMINVS_FLOAT vfredmin_vs_f32m4_f32m1
#define VFMVVF_FLOAT vfmv_v_f_f32m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFMINVV_FLOAT vfmin_vv_f32m4
#define VFADDVV_FLOAT vfadd_vv_f32m4
#define VFABSV_FLOAT vfabs_v_f32m4
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define VSETVL_MAX vsetvlmax_e64m4()
#define VSETVL_MAX_M1 vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m4_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLSEG_FLOAT vlseg2e64_v_f64m4
#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
#define VFREDMINVS_FLOAT vfredmin_vs_f64m4_f64m1
#define VFMVVF_FLOAT vfmv_v_f_f64m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFMINVV_FLOAT vfmin_vv_f64m4
#define VFADDVV_FLOAT vfadd_vv_f64m4
#define VFABSV_FLOAT vfabs_v_f64m4
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
#endif
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
FLOAT minf=0.0;
if (n <= 0 || inc_x <= 0) return(minf);
FLOAT_V_T v0, v1, vmin;
FLOAT_V_T_M1 v_res;
v_res = VFMVVF_FLOAT_M1(FLT_MAX, VSETVL_MAX_M1);
size_t vlmax = VSETVL_MAX;
vmin = VFMVVF_FLOAT(FLT_MAX, vlmax);
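// Same |re| + |im| (CABS1) measure as zamax, but folded with vfmin and seeded with FLT_MAX so
// the final reduction yields the minimum.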
if(inc_x == 1) {
for (size_t vl; n > 0; n -= vl, x += vl*2) {
vl = VSETVL(n);
VLSEG_FLOAT(&v0, &v1, x, vl);
v0 = VFABSV_FLOAT(v0, vl);
v1 = VFABSV_FLOAT(v1, vl);
v0 = VFADDVV_FLOAT(v0, v1, vl);
vmin = VFMINVV_FLOAT(vmin, v0, vl);
}
} else {
BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
vl = VSETVL(n);
VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl);
v0 = VFABSV_FLOAT(v0, vl);
v1 = VFABSV_FLOAT(v1, vl);
v0 = VFADDVV_FLOAT(v0, v1, vl);
vmin = VFMINVV_FLOAT(vmin, v0, vl);
}
}
v_res = VFREDMINVS_FLOAT(v_res, vmin, v_res, vlmax);
minf = VFMVFS_FLOAT_M1(v_res);
return(minf);
}

108
kernel/riscv64/zasum_rvv.c Normal file
View File

@ -0,0 +1,108 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m8()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
#define VFADDVV_FLOAT vfadd_vv_f32m8
#define VFABSV_FLOAT vfabs_v_f32m8
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m8()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
#define VFADDVV_FLOAT vfadd_vv_f64m8
#define VFABSV_FLOAT vfabs_v_f64m8
#endif
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
FLOAT asumf = 0.0;
if (n <= 0 || inc_x <= 0) return(asumf);
FLOAT_V_T v0, v1;
size_t vlmax = VSETVL_MAX;
FLOAT_V_T v_sum = VFMVVF_FLOAT(0, vlmax);
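// Accumulates |re| + |im| over all n complex entries; the unit-stride path reads the buffer
// as 2*n contiguous scalars, consuming two vectors per iteration.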
if(inc_x == 1) {
for (size_t vl; n > 0; n -= vl, x += vl*2) {
vl = VSETVL(n);
v0 = VLEV_FLOAT(x, vl);
v1 = VLEV_FLOAT(x+vl, vl);
v0 = VFABSV_FLOAT(v0, vl);
v1 = VFABSV_FLOAT(v1, vl);
v_sum = VFADDVV_FLOAT(v_sum, v0, vl);
v_sum = VFADDVV_FLOAT(v_sum, v1, vl);
}
}
else {
BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
vl = VSETVL(n);
v0 = VLSEV_FLOAT(x, stride_x, vl);
v1 = VLSEV_FLOAT(x+1, stride_x, vl);
v0 = VFABSV_FLOAT(v0, vl);
v1 = VFABSV_FLOAT(v1, vl);
v_sum = VFADDVV_FLOAT(v_sum, v0, vl);
v_sum = VFADDVV_FLOAT(v_sum, v1, vl);
}
}
FLOAT_V_T_M1 v_z0 = VFMVVF_FLOAT_M1(0, vlmax);
FLOAT_V_T_M1 v_res = VFMVVF_FLOAT_M1(0, vlmax);
v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, vlmax);
asumf += VFMVFS_FLOAT_M1(v_res);
return(asumf);
}

151
kernel/riscv64/zaxpby_rvv.c Normal file
View File

@ -0,0 +1,151 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/***************************************************************************
* 2014/06/07 Saar
*
***************************************************************************/
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define FLOAT_V_T vfloat32m4_t
#define VLSEV_FLOAT vlse32_v_f32m4
#define VSSEV_FLOAT vsse32_v_f32m4
#define VFMACCVF_FLOAT vfmacc_vf_f32m4
#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
#define VFMVVF_FLOAT vfmv_v_f_f32m4
#define VFMULVF_FLOAT vfmul_vf_f32m4
#define VFMSACVF_FLOAT vfmsac_vf_f32m4
#define VLSEG_FLOAT vlseg2e32_v_f32m4
#define VSSEG_FLOAT vsseg2e32_v_f32m4
#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
#define VSSSEG_FLOAT vssseg2e32_v_f32m4
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define FLOAT_V_T vfloat64m4_t
#define VLSEV_FLOAT vlse64_v_f64m4
#define VSSEV_FLOAT vsse64_v_f64m4
#define VFMACCVF_FLOAT vfmacc_vf_f64m4
#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
#define VFMVVF_FLOAT vfmv_v_f_f64m4
#define VFMULVF_FLOAT vfmul_vf_f64m4
#define VFMSACVF_FLOAT vfmsac_vf_f64m4
#define VLSEG_FLOAT vlseg2e64_v_f64m4
#define VSSEG_FLOAT vsseg2e64_v_f64m4
#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
#define VSSSEG_FLOAT vssseg2e64_v_f64m4
#endif
int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FLOAT beta_r, FLOAT beta_i,FLOAT *y, BLASLONG inc_y)
{
BLASLONG inc_x2, inc_y2;
if ( n <= 0 ) return(0);
inc_x2 = 2 * inc_x;
inc_y2 = 2 * inc_y;
BLASLONG stride_x = inc_x2 * sizeof(FLOAT);
BLASLONG stride_y = inc_y2 * sizeof(FLOAT);
FLOAT_V_T vx0, vx1, vy0, vy1;
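// Split into four cases on (alpha == 0, beta == 0) so zero operands are neither loaded nor
// multiplied; complex products are expanded into vfmul/vfmacc/vfnmsac/vfmsac on the separate
// real and imaginary registers.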
if ( beta_r == 0.0 && beta_i == 0.0)
{
if ( alpha_r == 0.0 && alpha_i == 0.0 )
{
size_t vl = VSETVL(n);
FLOAT_V_T temp = VFMVVF_FLOAT(0.0, vl);
for ( ; n > 0; n -= vl, y += vl*inc_y2)
{
vl = VSETVL(n);
VSSSEG_FLOAT(y, stride_y, temp, temp, vl);
}
}
else
{
for (size_t vl; n > 0; n -= vl, x += vl*inc_x2, y += vl*inc_y2)
{
vl = VSETVL(n);
VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
vy0 = VFMULVF_FLOAT(vx1, alpha_i, vl);
vy0 = VFMSACVF_FLOAT(vy0, alpha_r, vx0, vl);
vy1 = VFMULVF_FLOAT(vx1, alpha_r, vl);
vy1 = VFMACCVF_FLOAT(vy1, alpha_i, vx0, vl);
VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl);
}
}
}
else
{
FLOAT_V_T v0, v1;
if ( alpha_r == 0.0 && alpha_i == 0.0 )
{
for (size_t vl; n > 0; n -= vl, y += vl*inc_y2)
{
vl = VSETVL(n);
VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);
v0 = VFMULVF_FLOAT(vy1, beta_i, vl);
v0 = VFMSACVF_FLOAT(v0, beta_r, vy0, vl);
v1 = VFMULVF_FLOAT(vy1, beta_r, vl);
v1 = VFMACCVF_FLOAT(v1, beta_i, vy0, vl);
VSSSEG_FLOAT(y, stride_y, v0, v1, vl);
}
}
else
{
for (size_t vl; n > 0; n -= vl, x += vl*inc_x2, y += vl*inc_y2)
{
vl = VSETVL(n);
VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);
v0 = VFMULVF_FLOAT(vx0, alpha_r, vl);
v0 = VFNMSACVF_FLOAT(v0, alpha_i, vx1, vl);
v0 = VFMACCVF_FLOAT(v0, beta_r, vy0, vl);
v0 = VFNMSACVF_FLOAT(v0, beta_i, vy1, vl);
v1 = VFMULVF_FLOAT(vx1, alpha_r, vl);
v1 = VFMACCVF_FLOAT(v1, alpha_i, vx0, vl);
v1 = VFMACCVF_FLOAT(v1, beta_r, vy1, vl);
v1 = VFMACCVF_FLOAT(v1, beta_i, vy0, vl);
VSSSEG_FLOAT(y, stride_y, v0, v1, vl);
}
}
}
return(0);
}

154
kernel/riscv64/zaxpy_rvv.c Normal file
View File

@ -0,0 +1,154 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define FLOAT_V_T vfloat32m4_t
#define VLSEG_FLOAT vlseg2e32_v_f32m4
#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
#define VSSEG_FLOAT vsseg2e32_v_f32m4
#define VSSSEG_FLOAT vssseg2e32_v_f32m4
#define VFMACCVF_FLOAT vfmacc_vf_f32m4
#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define FLOAT_V_T vfloat64m4_t
#define VLSEG_FLOAT vlseg2e64_v_f64m4
#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
#define VSSEG_FLOAT vsseg2e64_v_f64m4
#define VSSSEG_FLOAT vssseg2e64_v_f64m4
#define VFMACCVF_FLOAT vfmacc_vf_f64m4
#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
#endif
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
if(n < 0) return(0);
if(da_r == 0.0 && da_i == 0.0) return(0);
FLOAT_V_T vx0, vx1, vy0, vy1;
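// Computes y += alpha * x (or y += alpha * conj(x) when CONJ is defined), with dedicated
// unit-stride and strided paths; segment-2 loads/stores keep real and imaginary parts in
// separate vector registers.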
if(inc_x == 1 && inc_y == 1) {
for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) {
vl = VSETVL(n);
VLSEG_FLOAT(&vx0, &vx1, x, vl);
VLSEG_FLOAT(&vy0, &vy1, y, vl);
#if !defined(CONJ)
vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl);
vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl);
vy1 = VFMACCVF_FLOAT(vy1, da_r, vx1, vl);
vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl);
#else
vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl);
vy0 = VFMACCVF_FLOAT(vy0, da_i, vx1, vl);
vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl);
vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl);
#endif
VSSEG_FLOAT(y, vy0, vy1, vl);
}
} else if (inc_x == 1) {
BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);
for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) {
vl = VSETVL(n);
VLSEG_FLOAT(&vx0, &vx1, x, vl);
VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);
#if !defined(CONJ)
vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl);
vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl);
vy1 = VFMACCVF_FLOAT(vy1, da_r, vx1, vl);
vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl);
#else
vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl);
vy0 = VFMACCVF_FLOAT(vy0, da_i, vx1, vl);
vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl);
vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl);
#endif
VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl);
}
} else if (inc_y == 1) {
BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) {
vl = VSETVL(n);
VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
VLSEG_FLOAT(&vy0, &vy1, y, vl);
#if !defined(CONJ)
vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl);
vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl);
vy1 = VFMACCVF_FLOAT(vy1, da_r, vx1, vl);
vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl);
#else
vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl);
vy0 = VFMACCVF_FLOAT(vy0, da_i, vx1, vl);
vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl);
vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl);
#endif
VSSEG_FLOAT(y, vy0, vy1, vl);
}
} else {
BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);
for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) {
vl = VSETVL(n);
VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);
#if !defined(CONJ)
vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl);
vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl);
vy1 = VFMACCVF_FLOAT(vy1, da_r, vx1, vl);
vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl);
#else
vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl);
vy0 = VFMACCVF_FLOAT(vy0, da_i, vx1, vl);
vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl);
vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl);
#endif
VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl);
}
}
return(0);
}

105
kernel/riscv64/zcopy_rvv.c Normal file
View File

@ -0,0 +1,105 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL_M8(n) vsetvl_e32m8(n)
#define FLOAT_V_T_M8 vfloat32m8_t
#define VLEV_FLOAT_M8 vle32_v_f32m8
#define VSEV_FLOAT_M8 vse32_v_f32m8
#define VSETVL_M4(n) vsetvl_e32m4(n)
#define FLOAT_V_T_M4 vfloat32m4_t
#define VLSEG_FLOAT_M4 vlseg2e32_v_f32m4
#define VSSEG_FLOAT_M4 vsseg2e32_v_f32m4
#define VLSSEG_FLOAT_M4 vlsseg2e32_v_f32m4
#define VSSSEG_FLOAT_M4 vssseg2e32_v_f32m4
#else
#define VSETVL_M8(n) vsetvl_e64m8(n)
#define FLOAT_V_T_M8 vfloat64m8_t
#define VLEV_FLOAT_M8 vle64_v_f64m8
#define VSEV_FLOAT_M8 vse64_v_f64m8
#define VSETVL_M4(n) vsetvl_e64m4(n)
#define FLOAT_V_T_M4 vfloat64m4_t
#define VLSEG_FLOAT_M4 vlseg2e64_v_f64m4
#define VSSEG_FLOAT_M4 vsseg2e64_v_f64m4
#define VLSSEG_FLOAT_M4 vlsseg2e64_v_f64m4
#define VSSSEG_FLOAT_M4 vssseg2e64_v_f64m4
#endif
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
if(n < 0) return(0);
if(inc_x == 1 && inc_y == 1) {
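// Both strides are 1: copy the interleaved data as 2*n plain scalars using the widest LMUL
// (m8); the other paths use segment-2 loads/stores to honor the strides.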
FLOAT_V_T_M8 vx;
n *= 2; // convert to words
for(size_t vl; n > 0; n -= vl, x += vl, y += vl) {
vl = VSETVL_M8(n);
vx = VLEV_FLOAT_M8(x, vl);
VSEV_FLOAT_M8(y, vx, vl);
}
}else if (1 == inc_x) {
FLOAT_V_T_M4 vr, vi;
BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);
for(size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) {
vl = VSETVL_M4(n);
VLSEG_FLOAT_M4(&vr, &vi, x, vl);
VSSSEG_FLOAT_M4(y, stride_y, vr, vi, vl);
}
} else if (1 == inc_y) {
FLOAT_V_T_M4 vr, vi;
BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
for(size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) {
vl = VSETVL_M4(n);
VLSSEG_FLOAT_M4(&vr, &vi, x, stride_x, vl);
VSSEG_FLOAT_M4(y, vr, vi, vl);
}
} else {
FLOAT_V_T_M4 vr, vi;
BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);
for(size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) {
vl = VSETVL_M4(n);
VLSSEG_FLOAT_M4(&vr, &vi, x, stride_x, vl);
VSSSEG_FLOAT_M4(y, stride_y, vr, vi, vl);
}
}
return(0);
}

170
kernel/riscv64/zdot_rvv.c Normal file
View File

@ -0,0 +1,170 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define VSETVL_MAX vsetvlmax_e32m4()
#define VSETVL_MAX_M1 vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m4_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLSEG_FLOAT vlseg2e32_v_f32m4
#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
#define VFMACCVV_FLOAT vfmacc_vv_f32m4
#define VFMVVF_FLOAT vfmv_v_f_f32m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFMULVV_FLOAT vfmul_vv_f32m4
#define VFMSACVV_FLOAT vfmsac_vv_f32m4
#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define VSETVL_MAX vsetvlmax_e64m4()
#define VSETVL_MAX_M1 vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m4_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLSEG_FLOAT vlseg2e64_v_f64m4
#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
#define VFMACCVV_FLOAT vfmacc_vv_f64m4
#define VFMVVF_FLOAT vfmv_v_f_f64m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFMULVV_FLOAT vfmul_vv_f64m4
#define VFMSACVV_FLOAT vfmsac_vv_f64m4
#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
#endif
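// Reference formulas for the two accumulation branches below (informal note, added
// for readability): with x = xr + i*xi and y = yr + i*yi, the loops accumulate
//   !CONJ:  real += xr*yr - xi*yi,  imag += xr*yi + xi*yr   (sum of x*y)
//    CONJ:  real += xr*yr + xi*yi,  imag += xr*yi - xi*yr   (sum of conj(x)*y)
// and the partial sums in (vr0, vr1) are reduced to scalars after the loop.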
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
OPENBLAS_COMPLEX_FLOAT result;
CREAL(result) = 0.0;
CIMAG(result) = 0.0;
if ( n <= 0 ) return(result);
FLOAT_V_T vr0, vr1, vx0, vx1, vy0, vy1;
FLOAT_V_T_M1 v_res, v_z0;
size_t vlmax_m1 = VSETVL_MAX_M1;
v_res = VFMVVF_FLOAT_M1(0, vlmax_m1);
v_z0 = VFMVVF_FLOAT_M1(0, vlmax_m1);
size_t vlmax = VSETVL_MAX;
vr0 = VFMVVF_FLOAT(0, vlmax);
vr1 = VFMVVF_FLOAT(0, vlmax);
if(inc_x == 1 && inc_y == 1) {
for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) {
vl = VSETVL(n);
VLSEG_FLOAT(&vx0, &vx1, x, vl);
VLSEG_FLOAT(&vy0, &vy1, y, vl);
vr0 = VFMACCVV_FLOAT(vr0, vx0, vy0, vl);
vr1 = VFMACCVV_FLOAT(vr1, vx0, vy1, vl);
#if !defined(CONJ)
vr0 = VFNMSACVV_FLOAT(vr0, vx1, vy1, vl);
vr1 = VFMACCVV_FLOAT(vr1, vx1, vy0, vl);
#else
vr0 = VFMACCVV_FLOAT(vr0, vx1, vy1, vl);
vr1 = VFNMSACVV_FLOAT(vr1, vx1, vy0, vl);
#endif
}
} else if (inc_x == 1){
BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);
for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) {
vl = VSETVL(n);
VLSEG_FLOAT(&vx0, &vx1, x, vl);
VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);
vr0 = VFMACCVV_FLOAT(vr0, vx0, vy0, vl);
vr1 = VFMACCVV_FLOAT(vr1, vx0, vy1, vl);
#if !defined(CONJ)
vr0 = VFNMSACVV_FLOAT(vr0, vx1, vy1, vl);
vr1 = VFMACCVV_FLOAT(vr1, vx1, vy0, vl);
#else
vr0 = VFMACCVV_FLOAT(vr0, vx1, vy1, vl);
vr1 = VFNMSACVV_FLOAT(vr1, vx1, vy0, vl);
#endif
}
} else if (inc_y == 1){
BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) {
vl = VSETVL(n);
VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
VLSEG_FLOAT(&vy0, &vy1, y, vl);
vr0 = VFMACCVV_FLOAT(vr0, vx0, vy0, vl);
vr1 = VFMACCVV_FLOAT(vr1, vx0, vy1, vl);
#if !defined(CONJ)
vr0 = VFNMSACVV_FLOAT(vr0, vx1, vy1, vl);
vr1 = VFMACCVV_FLOAT(vr1, vx1, vy0, vl);
#else
vr0 = VFMACCVV_FLOAT(vr0, vx1, vy1, vl);
vr1 = VFNMSACVV_FLOAT(vr1, vx1, vy0, vl);
#endif
}
}else {
BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);
for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) {
vl = VSETVL(n);
VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);
vr0 = VFMACCVV_FLOAT(vr0, vx0, vy0, vl);
vr1 = VFMACCVV_FLOAT(vr1, vx0, vy1, vl);
#if !defined(CONJ)
vr0 = VFNMSACVV_FLOAT(vr0, vx1, vy1, vl);
vr1 = VFMACCVV_FLOAT(vr1, vx1, vy0, vl);
#else
vr0 = VFMACCVV_FLOAT(vr0, vx1, vy1, vl);
vr1 = VFNMSACVV_FLOAT(vr1, vx1, vy0, vl);
#endif
}
}
v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, vlmax);
CREAL(result) = VFMVFS_FLOAT_M1(v_res);
v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, vlmax);
CIMAG(result) = VFMVFS_FLOAT_M1(v_res);
return(result);
}

View File

@ -0,0 +1,117 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define FLOAT_V_T vfloat32m4_t
#define VLSEG_FLOAT vlseg2e32_v_f32m4
#define VSSEG_FLOAT vsseg2e32_v_f32m4
#define VFMVVF_FLOAT vfmv_v_f_f32m4
#define VFMULVF_FLOAT vfmul_vf_f32m4
#define VFADDVV_FLOAT vfadd_vv_f32m4
#define VFSUBVV_FLOAT vfsub_vv_f32m4
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define FLOAT_V_T vfloat64m4_t
#define VLSEG_FLOAT vlseg2e64_v_f64m4
#define VSSEG_FLOAT vsseg2e64_v_f64m4
#define VFMVVF_FLOAT vfmv_v_f_f64m4
#define VFMULVF_FLOAT vfmul_vf_f64m4
#define VFADDVV_FLOAT vfadd_vv_f64m4
#define VFSUBVV_FLOAT vfsub_vv_f64m4
#endif
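// Informal sketch of the update performed below (standard complex scaling with
// beta = beta_r + i*beta_i and c = cr + i*ci):
//   cr' = beta_r*cr - beta_i*ci
//   ci' = beta_r*ci + beta_i*cr
// which is what the v1..v4 products and the VFSUBVV/VFADDVV pair compute per element.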
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1,
FLOAT beta_r, FLOAT beta_i,
FLOAT *dummy2, BLASLONG dummy3,
FLOAT *dummy4, BLASLONG dummy5,
FLOAT *c, BLASLONG ldc)
{
BLASLONG chunk;
FLOAT *c_offset;
size_t vl;
FLOAT_V_T vr, vi, v1, v2, v3, v4;
ldc *= 2;
c_offset = c;
if (beta_r == 0.0 && beta_i == 0.0) {
vl = VSETVL(m);
vr = VFMVVF_FLOAT(0.0, vl);
vi = VFMVVF_FLOAT(0.0, vl);
for( ; n > 0; n--, c += ldc) {
c_offset = c;
for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl*2) {
vl = VSETVL(chunk);
VSSEG_FLOAT(c_offset, vr, vi, vl);
}
}
} else {
for( ; n > 0; n--, c += ldc) {
c_offset = c;
for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl*2) {
vl = VSETVL(chunk);
VLSEG_FLOAT(&vr, &vi, c_offset, vl);
v1 = VFMULVF_FLOAT(vr, beta_r, vl);
v2 = VFMULVF_FLOAT(vi, beta_i, vl);
v3 = VFMULVF_FLOAT(vi, beta_r, vl);
v4 = VFMULVF_FLOAT(vr, beta_i, vl);
vr = VFSUBVV_FLOAT(v1, v2, vl);
vi = VFADDVV_FLOAT(v3, v4, vl);
VSSEG_FLOAT(c_offset, vr, vi, vl);
}
}
}
return 0;
}

View File

@ -0,0 +1,170 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define FLOAT_V_T vfloat32m4_t
#define VLEV_FLOAT vle32_v_f32m4
#define VLSEV_FLOAT vlse32_v_f32m4
#define VSEV_FLOAT vse32_v_f32m4
#define VSSEV_FLOAT vsse32_v_f32m4
#define VLSEG_FLOAT vlseg2e32_v_f32m4
#define VSSEG_FLOAT vsseg2e32_v_f32m4
#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
#define VSSSEG_FLOAT vssseg2e32_v_f32m4
#define VFMACCVF_FLOAT vfmacc_vf_f32m4
#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define FLOAT_V_T vfloat64m4_t
#define VLEV_FLOAT vle64_v_f64m4
#define VLSEV_FLOAT vlse64_v_f64m4
#define VSEV_FLOAT vse64_v_f64m4
#define VSSEV_FLOAT vsse64_v_f64m4
#define VLSEG_FLOAT vlseg2e64_v_f64m4
#define VSSEG_FLOAT vsseg2e64_v_f64m4
#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
#define VSSSEG_FLOAT vssseg2e64_v_f64m4
#define VFMACCVF_FLOAT vfmacc_vf_f64m4
#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
#endif
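// Hedged reading of the loop below: each outer iteration forms temp = alpha*x[j]
// (the sign of the alpha_i terms is chosen by XCONJ) and accumulates y += temp * A[:,j],
// with the CONJ/XCONJ combinations selecting the sign pattern of the four
// multiply-accumulates on the (vy0, vy1) real/imag register pair.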
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG i;
BLASLONG ix;
FLOAT *a_ptr;
FLOAT temp_r, temp_i;
FLOAT_V_T va0, va1, vy0, vy1;
BLASLONG stride_y = inc_y * sizeof(FLOAT) * 2;
BLASLONG inc_x2 = inc_x * 2;
BLASLONG lda2 = lda * 2;
if (inc_y == 1)
{
for (size_t vl; m > 0; m -= vl, a += vl*2, y += vl*2) {
vl = VSETVL(m);
a_ptr = a;
ix = 0;
VLSEG_FLOAT(&vy0, &vy1, y, vl);
for(i = 0; i < n; i++){
#if !defined(XCONJ)
temp_r = alpha_r * x[ix] - alpha_i * x[ix+1];
temp_i = alpha_r * x[ix+1] + alpha_i * x[ix];
#else
temp_r = alpha_r * x[ix] + alpha_i * x[ix+1];
temp_i = alpha_r * x[ix+1] - alpha_i * x[ix];
#endif
VLSEG_FLOAT(&va0, &va1, a_ptr, vl);
#if !defined(CONJ)
#if !defined(XCONJ)
vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl);
vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, vl);
vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, vl);
vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, vl);
#else
vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl);
vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, vl);
vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, vl);
vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, vl);
#endif
#else
#if !defined(XCONJ)
vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl);
vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, vl);
vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, vl);
vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, vl);
#else
vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl);
vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, vl);
vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, vl);
vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, vl);
#endif
#endif
a_ptr += lda2;
ix += inc_x2;
}
VSSEG_FLOAT(y, vy0, vy1, vl);
}
}
else
{
for (size_t vl; m > 0; m -= vl, a += vl*2, y += vl*inc_y*2) {
vl = VSETVL(m);
a_ptr = a;
ix = 0;
VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);
for(i = 0; i < n; i++){
#if !defined(XCONJ)
temp_r = alpha_r * x[ix] - alpha_i * x[ix+1];
temp_i = alpha_r * x[ix+1] + alpha_i * x[ix];
#else
temp_r = alpha_r * x[ix] + alpha_i * x[ix+1];
temp_i = alpha_r * x[ix+1] - alpha_i * x[ix];
#endif
VLSEG_FLOAT(&va0, &va1, a_ptr, vl);
#if !defined(CONJ)
#if !defined(XCONJ)
vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl);
vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, vl);
vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, vl);
vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, vl);
#else
vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl);
vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, vl);
vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, vl);
vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, vl);
#endif
#else
#if !defined(XCONJ)
vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl);
vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, vl);
vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, vl);
vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, vl);
#else
vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl);
vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, vl);
vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, vl);
vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, vl);
#endif
#endif
a_ptr += lda2;
ix += inc_x2;
}
VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl);
}
}
return(0);
}

View File

@ -0,0 +1,172 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define VSETVL_MAX_M1 vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m4_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLSEG_FLOAT vlseg2e32_v_f32m4
#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
#define VFMACCVV_FLOAT vfmacc_vv_f32m4
#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4
#define VFMVVF_FLOAT vfmv_v_f_f32m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFMULVV_FLOAT vfmul_vv_f32m4
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define VSETVL_MAX_M1 vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m4_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLSEG_FLOAT vlseg2e64_v_f64m4
#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
#define VFMACCVV_FLOAT vfmacc_vv_f64m4
#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4
#define VFMVVF_FLOAT vfmv_v_f_f64m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFMULVV_FLOAT vfmul_vv_f64m4
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
#endif
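// Hedged reading of the loops below: for each column j the kernel reduces
// temp = sum_k A[k][j]*x[k] from the (vr, vi) accumulators (the CONJ/XCONJ cases
// select the sign pattern of the multiply-accumulates), then updates
// y[j] += alpha*temp with the matching conjugation of alpha.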
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG i = 0, j = 0;
BLASLONG ix = 0, iy = 0;
FLOAT *a_ptr = a;
FLOAT temp_r, temp_i;
FLOAT_V_T va0, va1, vx0, vx1, vr, vi;
FLOAT_V_T_M1 v_res, v_z0;
BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
//BLASLONG stride_a = sizeof(FLOAT) * 2;
BLASLONG inc_y2 = inc_y * 2;
BLASLONG lda2 = lda * 2;
size_t vlmax = VSETVL_MAX_M1;
v_res = VFMVVF_FLOAT_M1(0, vlmax);
v_z0 = VFMVVF_FLOAT_M1(0, vlmax);
vlmax = VSETVL(m);
if (inc_x == 1)
{
for(i = 0; i < n; i++) {
j = 0;
ix = 0;
vr = VFMVVF_FLOAT(0, vlmax);
vi = VFMVVF_FLOAT(0, vlmax);
for(size_t vl, k = m; k > 0; k -= vl) {
vl = VSETVL(k);
VLSEG_FLOAT(&va0, &va1, &a_ptr[j], vl);
VLSEG_FLOAT(&vx0, &vx1, &x[ix], vl);
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
vr = VFMACCVV_FLOAT(vr, va0, vx0, vl);
vr = VFNMSACVV_FLOAT(vr, va1, vx1, vl);
vi = VFMACCVV_FLOAT(vi, va0, vx1, vl);
vi = VFMACCVV_FLOAT(vi, va1, vx0, vl);
#else
vr = VFMACCVV_FLOAT(vr, va0, vx0, vl);
vr = VFMACCVV_FLOAT(vr, va1, vx1, vl);
vi = VFMACCVV_FLOAT(vi, va0, vx1, vl);
vi = VFNMSACVV_FLOAT(vi, va1, vx0, vl);
#endif
j += vl * 2;
ix += vl * inc_x * 2;
}
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax);
temp_r = VFMVFS_FLOAT_M1(v_res);
v_res = VFREDSUM_FLOAT(v_res, vi, v_z0, vlmax);
temp_i = VFMVFS_FLOAT_M1(v_res);
#if !defined(XCONJ)
y[iy] += alpha_r * temp_r - alpha_i * temp_i;
y[iy+1] += alpha_r * temp_i + alpha_i * temp_r;
#else
y[iy] += alpha_r * temp_r + alpha_i * temp_i;
y[iy+1] -= alpha_r * temp_i - alpha_i * temp_r;
#endif
iy += inc_y2;
a_ptr += lda2;
}
}
else
{
for(i = 0; i < n; i++) {
j = 0;
ix = 0;
vr = VFMVVF_FLOAT(0, vlmax);
vi = VFMVVF_FLOAT(0, vlmax);
for(size_t vl, k = m; k > 0; k -= vl) {
vl = VSETVL(k);
VLSEG_FLOAT(&va0, &va1, &a_ptr[j], vl);
VLSSEG_FLOAT(&vx0, &vx1, &x[ix], stride_x, vl);
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
vr = VFMACCVV_FLOAT(vr, va0, vx0, vl);
vr = VFNMSACVV_FLOAT(vr, va1, vx1, vl);
vi = VFMACCVV_FLOAT(vi, va0, vx1, vl);
vi = VFMACCVV_FLOAT(vi, va1, vx0, vl);
#else
vr = VFMACCVV_FLOAT(vr, va0, vx0, vl);
vr = VFMACCVV_FLOAT(vr, va1, vx1, vl);
vi = VFMACCVV_FLOAT(vi, va0, vx1, vl);
vi = VFNMSACVV_FLOAT(vi, va1, vx0, vl);
#endif
j += vl * 2;
ix += vl * inc_x * 2;
}
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax);
temp_r = VFMVFS_FLOAT_M1(v_res);
v_res = VFREDSUM_FLOAT(v_res, vi, v_z0, vlmax);
temp_i = VFMVFS_FLOAT_M1(v_res);
#if !defined(XCONJ)
y[iy] += alpha_r * temp_r - alpha_i * temp_i;
y[iy+1] += alpha_r * temp_i + alpha_i * temp_r;
#else
y[iy] += alpha_r * temp_r + alpha_i * temp_i;
y[iy+1] -= alpha_r * temp_i - alpha_i * temp_r;
#endif
iy += inc_y2;
a_ptr += lda2;
}
}
return(0);
}

122
kernel/riscv64/znrm2_rvv.c Normal file
View File

@ -0,0 +1,122 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define VSETVL_MAX vsetvlmax_e32m4()
#define VSETVL_MAX_M1 vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m4_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLSEG_FLOAT vlseg2e32_v_f32m4
#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
#define VFMACCVV_FLOAT vfmacc_vv_f32m4
#define VFMVVF_FLOAT vfmv_v_f_f32m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
#define VFABSV_FLOAT vfabs_v_f32m4
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define VSETVL_MAX vsetvlmax_e64m4()
#define VSETVL_MAX_M1 vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m4_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLSEG_FLOAT vlseg2e64_v_f64m4
#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
#define VFMACCVV_FLOAT vfmacc_vv_f64m4
#define VFMVVF_FLOAT vfmv_v_f_f64m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
#define VFABSV_FLOAT vfabs_v_f64m4
#endif
// TODO: Should single precision use the widening MAC, or perhaps all should be double?
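// One possible answer to the TODO above (untested sketch; the intrinsic names are
// assumed to follow the same RVV intrinsics generation used elsewhere in this patch):
//   vfloat64m8_t acc = vfmv_v_f_f64m8(0, vsetvlmax_e64m8());
//   acc = vfwmacc_vv_f64m8(acc, v0, v0, vl);   // widen f32m4 squares into an f64m8 accumulator
// which would accumulate single-precision squares in double precision.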
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
if ( n <= 0 ) return(0.0);
FLOAT_V_T vr, v0, v1;
FLOAT_V_T_M1 v_max, v_res;
FLOAT scale = 0.0, ssq = 0.0;
size_t vlmax = VSETVL_MAX;
v_res = VFMVVF_FLOAT_M1(0, vlmax);
v_max = VFMVVF_FLOAT_M1(0, vlmax);
vr = VFMVVF_FLOAT(0, vlmax);
if (inc_x == 1) {
for (size_t vl; n > 0; n -= vl, x += vl*2) {
vl = VSETVL(n);
VLSEG_FLOAT(&v0, &v1, x, vl);
v0 = VFABSV_FLOAT(v0, vl);
v1 = VFABSV_FLOAT(v1, vl);
v_max = VFREDMAXVS_FLOAT(v_max, v0, v_max, vl);
vr = VFMACCVV_FLOAT(vr, v0, v0, vl);
v_max = VFREDMAXVS_FLOAT(v_max, v1, v_max, vl);
vr = VFMACCVV_FLOAT(vr, v1, v1, vl);
}
} else {
BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
vl = VSETVL(n);
VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl);
v0 = VFABSV_FLOAT(v0, vl);
v1 = VFABSV_FLOAT(v1, vl);
v_max = VFREDMAXVS_FLOAT(v_max, v0, v_max, vl);
vr = VFMACCVV_FLOAT(vr, v0, v0, vl);
v_max = VFREDMAXVS_FLOAT(v_max, v1, v_max, vl);
vr = VFMACCVV_FLOAT(vr, v1, v1, vl);
}
}
v_res = VFREDSUM_FLOAT(v_res, vr, v_res, vlmax);
ssq = VFMVFS_FLOAT_M1(v_res);
scale = VFMVFS_FLOAT_M1(v_max);
if (scale == 0.0) return(0.0); // all inputs are zero: avoid computing 0/0
ssq = ssq / (scale*scale);
return(scale * sqrt(ssq));
}

181
kernel/riscv64/zrot_rvv.c Normal file
View File

@ -0,0 +1,181 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define FLOAT_V_T vfloat32m4_t
#define VLEV_FLOAT vle32_v_f32m4
#define VLSEV_FLOAT vlse32_v_f32m4
#define VSEV_FLOAT vse32_v_f32m4
#define VSSEV_FLOAT vsse32_v_f32m4
#define VLSEG_FLOAT vlseg2e32_v_f32m4
#define VSSEG_FLOAT vsseg2e32_v_f32m4
#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
#define VSSSEG_FLOAT vssseg2e32_v_f32m4
#define VFMACCVF_FLOAT vfmacc_vf_f32m4
#define VFMULVF_FLOAT vfmul_vf_f32m4
#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define FLOAT_V_T vfloat64m4_t
#define VLEV_FLOAT vle64_v_f64m4
#define VLSEV_FLOAT vlse64_v_f64m4
#define VSEV_FLOAT vse64_v_f64m4
#define VSSEV_FLOAT vsse64_v_f64m4
#define VLSEG_FLOAT vlseg2e64_v_f64m4
#define VSSEG_FLOAT vsseg2e64_v_f64m4
#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
#define VSSSEG_FLOAT vssseg2e64_v_f64m4
#define VFMACCVF_FLOAT vfmacc_vf_f64m4
#define VFMULVF_FLOAT vfmul_vf_f64m4
#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
#endif
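// The plane rotation applied below, per complex element (real and imaginary parts
// are rotated independently because c and s are real scalars):
//   x' = c*x + s*y
//   y' = c*y - s*x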
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
if (n <= 0) return(0);
FLOAT_V_T vt0, vt1, vx0, vx1, vy0, vy1;
if (inc_x == 0 && inc_y == 0) {
BLASLONG i=0;
BLASLONG ix=0,iy=0;
FLOAT temp[2];
BLASLONG inc_x2;
BLASLONG inc_y2;
inc_x2 = 2 * inc_x ;
inc_y2 = 2 * inc_y ;
while(i < n)
{
temp[0] = c*x[ix] + s*y[iy] ;
temp[1] = c*x[ix+1] + s*y[iy+1] ;
y[iy] = c*y[iy] - s*x[ix] ;
y[iy+1] = c*y[iy+1] - s*x[ix+1] ;
x[ix] = temp[0] ;
x[ix+1] = temp[1] ;
ix += inc_x2 ;
iy += inc_y2 ;
i++ ;
}
}
else if(inc_x == 1 && inc_y == 1) {
for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) {
vl = VSETVL(n);
VLSEG_FLOAT(&vx0, &vx1, x, vl);
VLSEG_FLOAT(&vy0, &vy1, y, vl);
vt0 = VFMULVF_FLOAT(vx0, c, vl);
vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl);
vt1 = VFMULVF_FLOAT(vx1, c, vl);
vt1 = VFMACCVF_FLOAT(vt1, s, vy1, vl);
vy0 = VFMULVF_FLOAT(vy0, c, vl);
vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, vl);
vy1 = VFMULVF_FLOAT(vy1, c, vl);
vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl);
VSSEG_FLOAT(x, vt0, vt1, vl);
VSSEG_FLOAT(y, vy0, vy1, vl);
}
} else if (inc_x == 1){
BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);
for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) {
vl = VSETVL(n);
VLSEG_FLOAT(&vx0, &vx1, x, vl);
VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);
vt0 = VFMULVF_FLOAT(vx0, c, vl);
vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl);
vt1 = VFMULVF_FLOAT(vx1, c, vl);
vt1 = VFMACCVF_FLOAT(vt1, s, vy1, vl);
vy0 = VFMULVF_FLOAT(vy0, c, vl);
vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, vl);
vy1 = VFMULVF_FLOAT(vy1, c, vl);
vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl);
VSSEG_FLOAT(x, vt0, vt1, vl);
VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl);
}
} else if (inc_y == 1){
BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) {
vl = VSETVL(n);
VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
VLSEG_FLOAT(&vy0, &vy1, y, vl);
vt0 = VFMULVF_FLOAT(vx0, c, vl);
vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl);
vt1 = VFMULVF_FLOAT(vx1, c, vl);
vt1 = VFMACCVF_FLOAT(vt1, s, vy1, vl);
vy0 = VFMULVF_FLOAT(vy0, c, vl);
vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, vl);
vy1 = VFMULVF_FLOAT(vy1, c, vl);
vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl);
VSSSEG_FLOAT(x, stride_x, vt0, vt1, vl);
VSSEG_FLOAT(y, vy0, vy1, vl);
}
} else {
BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);
for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) {
vl = VSETVL(n);
VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);
vt0 = VFMULVF_FLOAT(vx0, c, vl);
vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl);
vt1 = VFMULVF_FLOAT(vx1, c, vl);
vt1 = VFMACCVF_FLOAT(vt1, s, vy1, vl);
vy0 = VFMULVF_FLOAT(vy0, c, vl);
vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, vl);
vy1 = VFMULVF_FLOAT(vy1, c, vl);
vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl);
VSSSEG_FLOAT(x, stride_x, vt0, vt1, vl);
VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl);
}
}
return 0;
}

148
kernel/riscv64/zscal_rvv.c Normal file
View File

@ -0,0 +1,148 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define VSETVL_MAX vsetvlmax_e32m4()
#define FLOAT_V_T vfloat32m4_t
#define VLSEG_FLOAT vlseg2e32_v_f32m4
#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
#define VSSEG_FLOAT vsseg2e32_v_f32m4
#define VSSSEG_FLOAT vssseg2e32_v_f32m4
#define VFMACCVF_FLOAT vfmacc_vf_f32m4
#define VFMULVF_FLOAT vfmul_vf_f32m4
#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
#define VFMVVF_FLOAT vfmv_v_f_f32m4
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define VSETVL_MAX vsetvlmax_e64m4()
#define FLOAT_V_T vfloat64m4_t
#define VLSEG_FLOAT vlseg2e64_v_f64m4
#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
#define VSSEG_FLOAT vsseg2e64_v_f64m4
#define VSSSEG_FLOAT vssseg2e64_v_f64m4
#define VFMACCVF_FLOAT vfmacc_vf_f64m4
#define VFMULVF_FLOAT vfmul_vf_f64m4
#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
#define VFMVVF_FLOAT vfmv_v_f_f64m4
#endif
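// Sketch of the general case below (da = da_r + i*da_i, x = xr + i*xi):
//   xr' = da_r*xr - da_i*xi
//   xi' = da_r*xi + da_i*xr
// with the da == 0, da_r == 0 and da_i == 0 branches handled separately to skip work.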
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
if((n <= 0) || (inc_x <= 0)) return(0);
FLOAT_V_T vt, vr, vi;
BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
size_t vlmax = VSETVL_MAX;
if(da_r == 0.0 && da_i == 0.0) {
vr = VFMVVF_FLOAT(0.0, vlmax);
vi = VFMVVF_FLOAT(0.0, vlmax);
if(inc_x == 1) {
for (size_t vl; n > 0; n -= vl, x += vl*2) {
vl = VSETVL(n);
VSSEG_FLOAT(x, vr, vi, vl);
}
} else {
for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
vl = VSETVL(n);
VSSSEG_FLOAT(x, stride_x, vr, vi, vl);
}
}
} else if(da_r == 0.0) {
for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
vl = VSETVL(n);
VLSSEG_FLOAT(&vr, &vi, x, stride_x, vl);
vt = VFMULVF_FLOAT(vi, -da_i, vl);
vi = VFMULVF_FLOAT(vr, da_i, vl);
VSSSEG_FLOAT(x, stride_x, vt, vi, vl);
}
} else if(da_i == 0.0) {
for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
vl = VSETVL(n);
VLSSEG_FLOAT(&vr, &vi, x, stride_x, vl);
vr = VFMULVF_FLOAT(vr, da_r, vl);
vi = VFMULVF_FLOAT(vi, da_r, vl);
VSSSEG_FLOAT(x, stride_x, vr, vi, vl);
}
} else {
if(inc_x == 1) {
for (size_t vl; n > 0; n -= vl, x += vl*2) {
vl = VSETVL(n);
VLSEG_FLOAT(&vr, &vi, x, vl);
vt = VFMULVF_FLOAT(vr, da_r, vl);
vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl);
vi = VFMULVF_FLOAT(vi, da_r, vl);
vi = VFMACCVF_FLOAT(vi, da_i, vr, vl);
VSSEG_FLOAT(x, vt, vi, vl);
}
} else {
for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
vl = VSETVL(n);
VLSSEG_FLOAT(&vr, &vi, x, stride_x, vl);
vt = VFMULVF_FLOAT(vr, da_r, vl);
vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl);
vi = VFMULVF_FLOAT(vi, da_r, vl);
vi = VFMACCVF_FLOAT(vi, da_i, vr, vl);
VSSSEG_FLOAT(x, stride_x, vt, vi, vl);
}
}
}
return(0);
}

97
kernel/riscv64/zsum_rvv.c Normal file
View File

@ -0,0 +1,97 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define VSETVL_MAX vsetvlmax_e32m4()
#define FLOAT_V_T vfloat32m4_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLSEG_FLOAT vlseg2e32_v_f32m4
#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
#define VFREDSUMVS_FLOAT vfredusum_vs_f32m4_f32m1
#define VFMVVF_FLOAT vfmv_v_f_f32m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
#define VFADDVV_FLOAT vfadd_vv_f32m4
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define VSETVL_MAX vsetvlmax_e64m4()
#define FLOAT_V_T vfloat64m4_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLSEG_FLOAT vlseg2e64_v_f64m4
#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
#define VFREDSUMVS_FLOAT vfredusum_vs_f64m4_f64m1
#define VFMVVF_FLOAT vfmv_v_f_f64m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
#define VFADDVV_FLOAT vfadd_vv_f64m4
#endif
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
FLOAT sumf = 0.0;
if (n <= 0 || inc_x <= 0) return(sumf);
FLOAT_V_T v0, v1;
size_t vlmax = VSETVL_MAX;
FLOAT_V_T v_sum = VFMVVF_FLOAT(0, vlmax);
if(inc_x == 1) {
for (size_t vl; n > 0; n -= vl, x += vl*2) {
vl = VSETVL(n);
VLSEG_FLOAT(&v0, &v1, x, vl);
v_sum = VFADDVV_FLOAT(v_sum, v0, vl);
v_sum = VFADDVV_FLOAT(v_sum, v1, vl);
}
} else {
BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
vl = VSETVL(n);
VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl);
v_sum = VFADDVV_FLOAT(v_sum, v0, vl);
v_sum = VFADDVV_FLOAT(v_sum, v1, vl);
}
}
FLOAT_V_T_M1 v_z0 = VFMVVF_FLOAT_M1(0, vlmax);
FLOAT_V_T_M1 v_res = VFMVVF_FLOAT_M1(0, vlmax);
v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, vlmax);
sumf += VFMVFS_FLOAT_M1(v_res);
return(sumf);
}

156
kernel/riscv64/zswap_rvv.c Normal file
View File

@ -0,0 +1,156 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define FLOAT_V_T vfloat32m4_t
#define VLSEG_FLOAT vlseg2e32_v_f32m4
#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
#define VSSEG_FLOAT vsseg2e32_v_f32m4
#define VSSSEG_FLOAT vssseg2e32_v_f32m4
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define FLOAT_V_T vfloat64m4_t
#define VLSEG_FLOAT vlseg2e64_v_f64m4
#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
#define VSSEG_FLOAT vsseg2e64_v_f64m4
#define VSSSEG_FLOAT vssseg2e64_v_f64m4
#endif
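// Note on the zero-increment branches below (informal reading): swapping against a
// zero-stride operand degenerates into a shift, so the fixed element pair is saved,
// the last complex element of the strided operand is copied into it, the strided
// operand is shifted by one complex element walking backwards with a negative stride,
// and the saved pair is finally stored into position 0.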
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
if (n <= 0) return(0);
FLOAT_V_T vx0, vx1, vy0, vy1;
if (inc_x == 0 && inc_y == 0) {
if (n & 1) {
FLOAT temp[2];
temp[0] = x[0];
temp[1] = x[1];
x[0] = y[0];
x[1] = y[1];
y[0] = temp[0];
y[1] = temp[1];
}
else {
return 0;
}
}
else if(inc_x == 0) {
FLOAT temp[2];
temp[0] = x[0];
temp[1] = x[1];
x[0] = y[(n - 1) * inc_y * 2];
x[1] = y[(n - 1) * inc_y * 2 + 1];
FLOAT* ptr = y + (n - 1) * inc_y * 2; // start from the last one
BLASLONG stride_y = (0 - inc_y) * sizeof(FLOAT) * 2; // reverse
BLASLONG m = n - 1;
for (size_t vl; m > 0; m -= vl * 2, ptr -= vl*inc_y * 2) {
vl = VSETVL(m);
VLSSEG_FLOAT(&vy0, &vy1, ptr - 2, stride_y, vl);
VSSSEG_FLOAT(ptr, stride_y, vy0, vy1, vl);
}
y[0] = temp[0];
y[1] = temp[1];
}
else if(inc_y == 0) {
FLOAT temp[2];
temp[0] = y[0];
temp[1] = y[1];
y[0] = x[(n - 1) * inc_x * 2];
y[1] = x[(n - 1) * inc_x * 2 + 1];
FLOAT* ptr = x + (n - 1) * inc_x * 2; // start from the last one
BLASLONG stride_x = (0 - inc_x) * sizeof(FLOAT) * 2; // reverse
BLASLONG m = n - 1;
for (size_t vl; m > 0; m -= vl * 2, ptr -= vl*inc_x * 2) {
vl = VSETVL(m);
VLSSEG_FLOAT(&vx0, &vx1, ptr - 2, stride_x, vl);
VSSSEG_FLOAT(ptr, stride_x, vx0, vx1, vl);
}
x[0] = temp[0];
x[1] = temp[1];
}
else if(inc_x == 1 && inc_y == 1) {
for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) {
vl = VSETVL(n);
VLSEG_FLOAT(&vx0, &vx1, x, vl);
VLSEG_FLOAT(&vy0, &vy1, y, vl);
VSSEG_FLOAT(y, vx0, vx1, vl);
VSSEG_FLOAT(x, vy0, vy1, vl);
}
} else if (inc_x == 1){
BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);
for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) {
vl = VSETVL(n);
VLSEG_FLOAT(&vx0, &vx1, x, vl);
VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);
VSSSEG_FLOAT(y, stride_y, vx0, vx1, vl);
VSSEG_FLOAT(x, vy0, vy1, vl);
}
} else if (inc_y == 1){
BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) {
vl = VSETVL(n);
VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
VLSEG_FLOAT(&vy0, &vy1, y, vl);
VSSEG_FLOAT(y, vx0, vx1, vl);
VSSSEG_FLOAT(x, stride_x, vy0, vy1, vl);
}
} else {
BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);
for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) {
vl = VSETVL(n);
VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);
VSSSEG_FLOAT(y, stride_y, vx0, vx1, vl);
VSSSEG_FLOAT(x, stride_x, vy0, vy1, vl);
}
}
return(0);
}

View File

@ -0,0 +1,596 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m2(n)
#define VSETVL_MAX vsetvlmax_e32m2()
#define VSETVL_MAX_M1 vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m2_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLEV_FLOAT vle32_v_f32m2
#define VLSEG4_FLOAT vlseg4e32_v_f32m2
#define VLSEG2_FLOAT vlseg2e32_v_f32m2
#define VFMVVF_FLOAT vfmv_v_f_f32m2
#define VFMACCVF_FLOAT vfmacc_vf_f32m2
#define VFMACCVV_FLOAT vfmacc_vv_f32m2
#define VFNMSACVV_FLOAT vfnmsac_vv_f32m2
#define VFREDSUMVS_FLOAT vfredusum_vs_f32m2_f32m1
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
#else
#define VSETVL(n) vsetvl_e64m2(n)
#define VSETVL_MAX vsetvlmax_e64m2()
#define VSETVL_MAX_M1 vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m2_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLEV_FLOAT vle64_v_f64m2
#define VLSEG4_FLOAT vlseg4e64_v_f64m2
#define VLSEG2_FLOAT vlseg2e64_v_f64m2
#define VFMVVF_FLOAT vfmv_v_f_f64m2
#define VFMACCVF_FLOAT vfmacc_vf_f64m2
#define VFMACCVV_FLOAT vfmacc_vv_f64m2
#define VFNMSACVV_FLOAT vfnmsac_vv_f64m2
#define VFREDSUMVS_FLOAT vfredusum_vs_f64m2_f64m1
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
#endif
// Optimizes the implementation in ../generic/ztrmmkernel_2x2.c
/********************************
ADD1 a*c
ADD2 b*c
ADD3 a*d
ADD4 b*d
*********************************/
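// With a+bi taken from A and c+di from B, the four products above combine as
//   (a+bi)*(c+di)           = (ac - bd) + i(bc + ad)   -> NN/NT/TN/TT blocks
//   (a+bi)*conj(c+di)       = (ac + bd) + i(bc - ad)   -> NR/NC/TR/TC blocks
//   conj(a+bi)*(c+di)       = (ac + bd) + i(ad - bc)   -> RN/RT/CN/CT blocks
//   conj(a+bi)*conj(c+di)   = (ac - bd) - i(bc + ad)   -> RR/RC/CR/CC blocks
// (informal note added for readability; the #if blocks below follow these sign patterns).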
int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb,
FLOAT* C,BLASLONG ldc, BLASLONG offset)
{
BLASLONG i,j,k;
FLOAT *C0,*C1,*ptrba,*ptrbb;
FLOAT res0,res1;
BLASLONG off, temp;
FLOAT_V_T va0, va1, va2, va3, vb0, vb1, vb2, vb3;
FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7;
FLOAT_V_T_M1 v_m1_res0, v_m1_res1;
FLOAT_V_T_M1 v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1);
size_t vl;
size_t vlmax = VSETVL_MAX;
#if defined(TRMMKERNEL) && !defined(LEFT)
off = -offset;
#else
off = 0;
#endif
for (j = bn/2; j > 0; j--)
{
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
C0 = C;
C1 = C0+2*ldc;
ptrba = ba;
for (i = bm/2; i > 0; i--)
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*2*2;
ptrbb = bb+off*2*2;
#endif
vres0 = VFMVVF_FLOAT(0.0, vlmax);
vres1 = VFMVVF_FLOAT(0.0, vlmax);
vres2 = VFMVVF_FLOAT(0.0, vlmax);
vres3 = VFMVVF_FLOAT(0.0, vlmax);
vres4 = VFMVVF_FLOAT(0.0, vlmax);
vres5 = VFMVVF_FLOAT(0.0, vlmax);
vres6 = VFMVVF_FLOAT(0.0, vlmax);
vres7 = VFMVVF_FLOAT(0.0, vlmax);
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk - off;
#elif defined(LEFT)
temp = off + 2;
#else
temp = off + 2;
#endif
for (k = temp; k > 0; k -= vl)
{
vl = VSETVL(k);
VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl);
VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl);
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl);
vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl);
vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl);
vres2 = VFNMSACVV_FLOAT(vres2, va3, vb1, vl);
vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl);
vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl);
vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl);
vres4 = VFNMSACVV_FLOAT(vres4, va1, vb3, vl);
vres5 = VFMACCVV_FLOAT(vres5, va0, vb3, vl);
vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl);
vres7 = VFMACCVV_FLOAT(vres7, va3, vb2, vl);
vres6 = VFNMSACVV_FLOAT(vres6, va3, vb3, vl);
vres7 = VFMACCVV_FLOAT(vres7, va2, vb3, vl);
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl);
vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl);
vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl);
vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl);
vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl);
vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl);
vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl);
vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl);
vres4 = VFMACCVV_FLOAT(vres4, va1, vb3, vl);
vres5 = VFNMSACVV_FLOAT(vres5, va0, vb3, vl);
vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl);
vres7 = VFMACCVV_FLOAT(vres7, va3, vb2, vl);
vres6 = VFMACCVV_FLOAT(vres6, va3, vb3, vl);
vres7 = VFNMSACVV_FLOAT(vres7, va2, vb3, vl);
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl);
vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl);
vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl);
vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl);
vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl);
vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl);
vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl);
vres5 = VFNMSACVV_FLOAT(vres5, va1, vb2, vl);
vres4 = VFMACCVV_FLOAT(vres4, va1, vb3, vl);
vres5 = VFMACCVV_FLOAT(vres5, va0, vb3, vl);
vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl);
vres7 = VFNMSACVV_FLOAT(vres7, va3, vb2, vl);
vres6 = VFMACCVV_FLOAT(vres6, va3, vb3, vl);
vres7 = VFMACCVV_FLOAT(vres7, va2, vb3, vl);
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl);
vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl);
vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl);
vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl);
vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl);
vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl);
vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl);
vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl);
vres5 = VFNMSACVV_FLOAT(vres5, va1, vb2, vl);
vres4 = VFMACCVV_FLOAT(vres4, va1, vb3, vl);
vres5 = VFNMSACVV_FLOAT(vres5, va0, vb3, vl);
vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl);
vres7 = VFNMSACVV_FLOAT(vres7, va3, vb2, vl);
vres6 = VFMACCVV_FLOAT(vres6, va3, vb3, vl);
vres7 = VFNMSACVV_FLOAT(vres7, va2, vb3, vl);
#endif
ptrba += vl * 4;
ptrbb += vl * 4;
}
v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax);
v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax);
res0 = VFMVFS_FLOAT_M1(v_m1_res0);
res1 = VFMVFS_FLOAT_M1(v_m1_res1);
C0[0] = res0 * alphar - res1 * alphai;
C0[1] = res1 * alphar + res0 * alphai;
v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres2, v_z0, vlmax);
v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres3, v_z0, vlmax);
res0 = VFMVFS_FLOAT_M1(v_m1_res0);
res1 = VFMVFS_FLOAT_M1(v_m1_res1);
C0[2] = res0 * alphar - res1 * alphai;
C0[3] = res1 * alphar + res0 * alphai;
v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres4, v_z0, vlmax);
v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres5, v_z0, vlmax);
res0 = VFMVFS_FLOAT_M1(v_m1_res0);
res1 = VFMVFS_FLOAT_M1(v_m1_res1);
C1[0] = res0 * alphar - res1 * alphai;
C1[1] = res1 * alphar + res0 * alphai;
v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres6, v_z0, vlmax);
v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres7, v_z0, vlmax);
res0 = VFMVFS_FLOAT_M1(v_m1_res0);
res1 = VFMVFS_FLOAT_M1(v_m1_res1);
C1[2] = res0 * alphar - res1 * alphai;
C1[3] = res1 * alphar + res0 * alphai;
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 2;
#else
temp -= 2;
#endif
ptrba += temp*2*2;
ptrbb += temp*2*2;
#endif
#ifdef LEFT
off += 2;
#endif
C0 = C0+4;
C1 = C1+4;
}
if (bm & 1)
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*2;
ptrbb = bb + off*2*2;
#endif
vres0 = VFMVVF_FLOAT(0.0, vlmax);
vres1 = VFMVVF_FLOAT(0.0, vlmax);
vres2 = VFMVVF_FLOAT(0.0, vlmax);
vres3 = VFMVVF_FLOAT(0.0, vlmax);
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk - off;
#elif defined(LEFT)
temp = off+1;
#else
temp = off+2;
#endif
for (k = temp; k > 0; k -= vl)
{
vl = VSETVL(k);
VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl);
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl);
vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl);
vres3 = VFMACCVV_FLOAT(vres3, va1, vb2, vl);
vres2 = VFNMSACVV_FLOAT(vres2, va1, vb3, vl);
vres3 = VFMACCVV_FLOAT(vres3, va0, vb3, vl);
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl);
vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl);
vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl);
vres3 = VFMACCVV_FLOAT(vres3, va1, vb2, vl);
vres2 = VFMACCVV_FLOAT(vres2, va1, vb3, vl);
vres3 = VFNMSACVV_FLOAT(vres3, va0, vb3, vl);
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl);
vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl);
vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl);
vres3 = VFNMSACVV_FLOAT(vres3, va1, vb2, vl);
vres2 = VFMACCVV_FLOAT(vres2, va1, vb3, vl);
vres3 = VFMACCVV_FLOAT(vres3, va0, vb3, vl);
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl);
vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl);
vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl);
vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl);
vres3 = VFNMSACVV_FLOAT(vres3, va1, vb2, vl);
vres2 = VFNMSACVV_FLOAT(vres2, va1, vb3, vl);
vres3 = VFNMSACVV_FLOAT(vres3, va0, vb3, vl);
#endif
ptrba += vl * 2;
ptrbb += vl * 4;
}
v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax);
v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax);
res0 = VFMVFS_FLOAT_M1(v_m1_res0);
res1 = VFMVFS_FLOAT_M1(v_m1_res1);
C0[0] = res0 * alphar - res1 * alphai;
C0[1] = res1 * alphar + res0 * alphai;
v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres2, v_z0, vlmax);
v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres3, v_z0, vlmax);
res0 = VFMVFS_FLOAT_M1(v_m1_res0);
res1 = VFMVFS_FLOAT_M1(v_m1_res1);
C1[0] = res0 * alphar - res1 * alphai;
C1[1] = res1 * alphar + res0 * alphai;
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 1;
#else
temp -= 2;
#endif
ptrba += temp*2;
ptrbb += temp*2*2;
#endif
#ifdef LEFT
off += 1;
#endif
C0 = C0+2;
C1 = C1+2;
}
#if defined(TRMMKERNEL) && !defined(LEFT)
off += 2;
#endif
k = (bk<<2);
bb = bb+k;
i = (ldc<<2);
C = C+i;
}
if (bn & 1)
{
C0 = C;
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
ptrba = ba;
for (i = bm/2; i > 0; i--)
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*2*2;
ptrbb = bb+off*2;
#endif
vres0 = VFMVVF_FLOAT(0.0, vlmax);
vres1 = VFMVVF_FLOAT(0.0, vlmax);
vres2 = VFMVVF_FLOAT(0.0, vlmax);
vres3 = VFMVVF_FLOAT(0.0, vlmax);
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk - off;
#elif defined(LEFT)
temp = off + 2;
#else
temp = off + 1;
#endif
for (k = temp; k > 0; k -= vl)
{
vl = VSETVL(k);
VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl);
VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl);
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl);
vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl);
vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl);
vres2 = VFNMSACVV_FLOAT(vres2, va3, vb1, vl);
vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl);
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl);
vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl);
vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl);
vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl);
vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl);
vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl);
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl);
vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl);
vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl);
vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl);
vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl);
vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl);
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl);
vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl);
vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl);
vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl);
vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl);
vres2 = VFNMSACVV_FLOAT(vres2, va3, vb1, vl);
vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl);
#endif
ptrba += vl * 4;
ptrbb += vl * 2;
}
v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax);
v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax);
res0 = VFMVFS_FLOAT_M1(v_m1_res0);
res1 = VFMVFS_FLOAT_M1(v_m1_res1);
C0[0] = res0 * alphar - res1 * alphai;
C0[1] = res1 * alphar + res0 * alphai;
v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres2, v_z0, vlmax);
v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres3, v_z0, vlmax);
res0 = VFMVFS_FLOAT_M1(v_m1_res0);
res1 = VFMVFS_FLOAT_M1(v_m1_res1);
C0[2] = res0 * alphar - res1 * alphai;
C0[3] = res1 * alphar + res0 * alphai;
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
temp = bk-off;
#ifdef LEFT
temp -= 2;
#else
temp -= 1;
#endif
ptrba += temp*2*2;
ptrbb += temp*2;
#endif
#ifdef LEFT
off += 2;
#endif
C0 = C0+4;
}
if (bm & 1)
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*2;
ptrbb = bb + off*2;
#endif
vres0 = VFMVVF_FLOAT(0.0, vlmax);
vres1 = VFMVVF_FLOAT(0.0, vlmax);
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off + 1;
#else
temp = off + 1;
#endif
for (k = temp; k > 0; k -= vl)
{
vl = VSETVL(k);
VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl);
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl);
vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl);
vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl);
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl);
vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl);
vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl);
vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl);
vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl);
#endif
ptrba += vl * 2;
ptrbb += vl * 2;
}
v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax);
v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax);
res0 = VFMVFS_FLOAT_M1(v_m1_res0);
res1 = VFMVFS_FLOAT_M1(v_m1_res1);
C0[0] = res0 * alphar - res1 * alphai;
C0[1] = res1 * alphar + res0 * alphai;
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 1;
#else
temp -= 1;
#endif
ptrba += temp*2;
ptrbb += temp*2;
#endif
#ifdef LEFT
off += 1;
#endif
C0 = C0+2;
}
k = (bk<<1);
bb = bb+k;
i = (ldc<<1);
C = C+i;
}
return 0;
}

44
param.h
View File

@ -3038,6 +3038,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#if defined(x280)
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL
#define SGEMM_DEFAULT_UNROLL_M 16 // 4 // 16 // 2
#define SGEMM_DEFAULT_UNROLL_N 8 // 4 // 4 // 2
/* SGEMM_UNROLL_MN is calculated as max(SGEMM_UNROLL_M, SGEMM_UNROLL_N).
* Since SGEMM_UNROLL_M is not defined correctly here, this macro has to be set manually.
* If the VLMAX size ever exceeds 1024, this value should be increased as well. */
#define SGEMM_DEFAULT_UNROLL_MN 32
#define DGEMM_DEFAULT_UNROLL_M 16 //2 // 8
#define DGEMM_DEFAULT_UNROLL_N 8 //2 // 4
#define DGEMM_DEFAULT_UNROLL_MN 32
#define CGEMM_DEFAULT_UNROLL_M 2
#define CGEMM_DEFAULT_UNROLL_N 2
#define ZGEMM_DEFAULT_UNROLL_M 2
#define ZGEMM_DEFAULT_UNROLL_N 2
#define SGEMM_DEFAULT_P 160
#define DGEMM_DEFAULT_P 160
#define CGEMM_DEFAULT_P 96
#define ZGEMM_DEFAULT_P 64
#define SGEMM_DEFAULT_Q 240
#define DGEMM_DEFAULT_Q 128
#define CGEMM_DEFAULT_Q 120
#define ZGEMM_DEFAULT_Q 120
#define SGEMM_DEFAULT_R 12288
#define DGEMM_DEFAULT_R 8192
#define CGEMM_DEFAULT_R 4096
#define ZGEMM_DEFAULT_R 4096
#define SYMV_P 16
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 0
#endif
#ifdef C910V
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 0