From bef47917bd72f35c151038fee0cf485445476863 Mon Sep 17 00:00:00 2001 From: Heller Zheng Date: Tue, 15 Nov 2022 00:06:25 -0800 Subject: [PATCH 1/5] Initial version for riscv sifive x280 --- Makefile.install | 7 + Makefile.prebuild | 8 + Makefile.riscv64 | 8 + README.md | 5 + TargetList.txt | 1 + benchmark/Makefile | 6 + common_riscv64.h | 4 + cpuid_riscv64.c | 2 + getarch.c | 12 + kernel/riscv64/KERNEL.x280 | 267 ++++++++ kernel/riscv64/amax_rvv.c | 102 +++ kernel/riscv64/amin_rvv.c | 102 +++ kernel/riscv64/asum_rvv.c | 99 +++ kernel/riscv64/axpby_rvv.c | 171 +++++ kernel/riscv64/axpy_rvv.c | 109 +++ kernel/riscv64/copy_rvv.c | 94 +++ kernel/riscv64/dot_rvv.c | 126 ++++ kernel/riscv64/gemm_beta_rvv.c | 89 +++ kernel/riscv64/gemm_ncopy_2_rvv.c | 92 +++ kernel/riscv64/gemm_ncopy_4_rvv.c | 123 ++++ kernel/riscv64/gemm_ncopy_8_rvv.c | 164 +++++ kernel/riscv64/gemm_ncopy_rvv_v1.c | 76 +++ kernel/riscv64/gemm_tcopy_2_rvv.c | 108 +++ kernel/riscv64/gemm_tcopy_4_rvv.c | 236 +++++++ kernel/riscv64/gemm_tcopy_8_rvv.c | 264 ++++++++ kernel/riscv64/gemm_tcopy_rvv_v1.c | 74 +++ kernel/riscv64/gemmkernel_2x2_rvv.c | 214 ++++++ kernel/riscv64/gemmkernel_4x4_rvv.c | 508 ++++++++++++++ kernel/riscv64/gemmkernel_rvv_v1x8.c | 601 +++++++++++++++++ kernel/riscv64/gemv_n_rvv.c | 94 +++ kernel/riscv64/gemv_t_rvv.c | 119 ++++ kernel/riscv64/iamax_rvv.c | 150 +++++ kernel/riscv64/iamin_rvv.c | 151 +++++ kernel/riscv64/imax_rvv.c | 147 +++++ kernel/riscv64/imin_rvv.c | 147 +++++ kernel/riscv64/izamax_rvv.c | 162 +++++ kernel/riscv64/izamin_rvv.c | 161 +++++ kernel/riscv64/max_rvv.c | 98 +++ kernel/riscv64/min_rvv.c | 98 +++ kernel/riscv64/nrm2_rvv.c | 117 ++++ kernel/riscv64/rot_rvv.c | 149 +++++ kernel/riscv64/scal_rvv.c | 80 +++ kernel/riscv64/sum_rvv.c | 95 +++ kernel/riscv64/swap_rvv.c | 142 ++++ kernel/riscv64/symm_lcopy_rvv_v1.c | 101 +++ kernel/riscv64/symm_ucopy_rvv_v1.c | 100 +++ kernel/riscv64/symv_L_rvv.c | 224 +++++++ kernel/riscv64/symv_U_rvv.c | 221 +++++++ kernel/riscv64/trmm_lncopy_rvv_v1.c | 138 ++++ kernel/riscv64/trmm_ltcopy_rvv_v1.c | 134 ++++ kernel/riscv64/trmm_uncopy_rvv_v1.c | 136 ++++ kernel/riscv64/trmm_utcopy_rvv_v1.c | 133 ++++ kernel/riscv64/trmmkernel_2x2_rvv.c | 342 ++++++++++ kernel/riscv64/trmmkernel_4x4_rvv.c | 881 +++++++++++++++++++++++++ kernel/riscv64/trmmkernel_rvv_v1x8.c | 685 +++++++++++++++++++ kernel/riscv64/trsm_kernel_LN_rvv_v1.c | 847 ++++++++++++++++++++++++ kernel/riscv64/trsm_kernel_LT_rvv_v1.c | 840 +++++++++++++++++++++++ kernel/riscv64/trsm_kernel_RN_rvv_v1.c | 792 ++++++++++++++++++++++ kernel/riscv64/trsm_kernel_RT_rvv_v1.c | 828 +++++++++++++++++++++++ kernel/riscv64/trsm_lncopy_rvv_v1.c | 122 ++++ kernel/riscv64/trsm_ltcopy_rvv_v1.c | 122 ++++ kernel/riscv64/trsm_uncopy_rvv_v1.c | 121 ++++ kernel/riscv64/trsm_utcopy_rvv_v1.c | 123 ++++ kernel/riscv64/zamax_rvv.c | 113 ++++ kernel/riscv64/zamin_rvv.c | 112 ++++ kernel/riscv64/zasum_rvv.c | 108 +++ kernel/riscv64/zaxpby_rvv.c | 151 +++++ kernel/riscv64/zaxpy_rvv.c | 154 +++++ kernel/riscv64/zcopy_rvv.c | 105 +++ kernel/riscv64/zdot_rvv.c | 170 +++++ kernel/riscv64/zgemm_beta_rvv.c | 117 ++++ kernel/riscv64/zgemv_n_rvv.c | 170 +++++ kernel/riscv64/zgemv_t_rvv.c | 172 +++++ kernel/riscv64/znrm2_rvv.c | 122 ++++ kernel/riscv64/zrot_rvv.c | 181 +++++ kernel/riscv64/zscal_rvv.c | 148 +++++ kernel/riscv64/zsum_rvv.c | 97 +++ kernel/riscv64/zswap_rvv.c | 156 +++++ kernel/riscv64/ztrmmkernel_2x2_rvv.c | 596 +++++++++++++++++ param.h | 44 ++ 80 files changed, 15188 insertions(+) create mode 100644 
kernel/riscv64/KERNEL.x280 create mode 100644 kernel/riscv64/amax_rvv.c create mode 100644 kernel/riscv64/amin_rvv.c create mode 100644 kernel/riscv64/asum_rvv.c create mode 100644 kernel/riscv64/axpby_rvv.c create mode 100644 kernel/riscv64/axpy_rvv.c create mode 100644 kernel/riscv64/copy_rvv.c create mode 100644 kernel/riscv64/dot_rvv.c create mode 100644 kernel/riscv64/gemm_beta_rvv.c create mode 100644 kernel/riscv64/gemm_ncopy_2_rvv.c create mode 100644 kernel/riscv64/gemm_ncopy_4_rvv.c create mode 100644 kernel/riscv64/gemm_ncopy_8_rvv.c create mode 100644 kernel/riscv64/gemm_ncopy_rvv_v1.c create mode 100644 kernel/riscv64/gemm_tcopy_2_rvv.c create mode 100644 kernel/riscv64/gemm_tcopy_4_rvv.c create mode 100644 kernel/riscv64/gemm_tcopy_8_rvv.c create mode 100644 kernel/riscv64/gemm_tcopy_rvv_v1.c create mode 100644 kernel/riscv64/gemmkernel_2x2_rvv.c create mode 100644 kernel/riscv64/gemmkernel_4x4_rvv.c create mode 100644 kernel/riscv64/gemmkernel_rvv_v1x8.c create mode 100644 kernel/riscv64/gemv_n_rvv.c create mode 100644 kernel/riscv64/gemv_t_rvv.c create mode 100644 kernel/riscv64/iamax_rvv.c create mode 100644 kernel/riscv64/iamin_rvv.c create mode 100644 kernel/riscv64/imax_rvv.c create mode 100644 kernel/riscv64/imin_rvv.c create mode 100644 kernel/riscv64/izamax_rvv.c create mode 100644 kernel/riscv64/izamin_rvv.c create mode 100644 kernel/riscv64/max_rvv.c create mode 100644 kernel/riscv64/min_rvv.c create mode 100644 kernel/riscv64/nrm2_rvv.c create mode 100644 kernel/riscv64/rot_rvv.c create mode 100644 kernel/riscv64/scal_rvv.c create mode 100644 kernel/riscv64/sum_rvv.c create mode 100644 kernel/riscv64/swap_rvv.c create mode 100644 kernel/riscv64/symm_lcopy_rvv_v1.c create mode 100644 kernel/riscv64/symm_ucopy_rvv_v1.c create mode 100644 kernel/riscv64/symv_L_rvv.c create mode 100644 kernel/riscv64/symv_U_rvv.c create mode 100644 kernel/riscv64/trmm_lncopy_rvv_v1.c create mode 100644 kernel/riscv64/trmm_ltcopy_rvv_v1.c create mode 100644 kernel/riscv64/trmm_uncopy_rvv_v1.c create mode 100644 kernel/riscv64/trmm_utcopy_rvv_v1.c create mode 100644 kernel/riscv64/trmmkernel_2x2_rvv.c create mode 100644 kernel/riscv64/trmmkernel_4x4_rvv.c create mode 100644 kernel/riscv64/trmmkernel_rvv_v1x8.c create mode 100644 kernel/riscv64/trsm_kernel_LN_rvv_v1.c create mode 100644 kernel/riscv64/trsm_kernel_LT_rvv_v1.c create mode 100644 kernel/riscv64/trsm_kernel_RN_rvv_v1.c create mode 100644 kernel/riscv64/trsm_kernel_RT_rvv_v1.c create mode 100644 kernel/riscv64/trsm_lncopy_rvv_v1.c create mode 100644 kernel/riscv64/trsm_ltcopy_rvv_v1.c create mode 100644 kernel/riscv64/trsm_uncopy_rvv_v1.c create mode 100644 kernel/riscv64/trsm_utcopy_rvv_v1.c create mode 100644 kernel/riscv64/zamax_rvv.c create mode 100644 kernel/riscv64/zamin_rvv.c create mode 100644 kernel/riscv64/zasum_rvv.c create mode 100644 kernel/riscv64/zaxpby_rvv.c create mode 100644 kernel/riscv64/zaxpy_rvv.c create mode 100644 kernel/riscv64/zcopy_rvv.c create mode 100644 kernel/riscv64/zdot_rvv.c create mode 100644 kernel/riscv64/zgemm_beta_rvv.c create mode 100644 kernel/riscv64/zgemv_n_rvv.c create mode 100644 kernel/riscv64/zgemv_t_rvv.c create mode 100644 kernel/riscv64/znrm2_rvv.c create mode 100644 kernel/riscv64/zrot_rvv.c create mode 100644 kernel/riscv64/zscal_rvv.c create mode 100644 kernel/riscv64/zsum_rvv.c create mode 100644 kernel/riscv64/zswap_rvv.c create mode 100644 kernel/riscv64/ztrmmkernel_2x2_rvv.c diff --git a/Makefile.install b/Makefile.install index 87b5bc870..f1adaa271 100644 --- 
a/Makefile.install +++ b/Makefile.install @@ -8,6 +8,7 @@ PREFIX ?= /opt/OpenBLAS OPENBLAS_INCLUDE_DIR := $(PREFIX)/include OPENBLAS_LIBRARY_DIR := $(PREFIX)/lib OPENBLAS_BINARY_DIR := $(PREFIX)/bin +OPENBLAS_RELEASE_DIR := $(PREFIX)/release OPENBLAS_BUILD_DIR := $(CURDIR) OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/$(LIBSONAMEBASE) OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake @@ -38,6 +39,7 @@ install : lib.grd @-mkdir -p "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)" @-mkdir -p "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @-mkdir -p "$(DESTDIR)$(OPENBLAS_BINARY_DIR)" + @-mkdir -p "$(DESTDIR)$(OPENBLAS_RELEASE_DIR)" @-mkdir -p "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)" @-mkdir -p "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" @echo Generating openblas_config.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) @@ -202,3 +204,8 @@ endif @echo " endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo "endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo Install OK! +#Generating release tar + @echo Generating $(OPENBLAS_RELEASE_DIR)/$(basename $(LIBNAME)).tar.gz + @tar -cvz --file=$(OPENBLAS_RELEASE_DIR)/$(basename $(LIBNAME)).tar.gz --directory=$(PREFIX) --exclude=release . + + diff --git a/Makefile.prebuild b/Makefile.prebuild index 0be4f1274..e6a8eab59 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -55,6 +55,14 @@ ifeq ($(TARGET), C910V) TARGET_FLAGS = -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d endif +ifeq ($(TARGET), x280) +TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh_xsfvqmaccqoq_xsfvfhbfmin -mabi=lp64d -mcpu=sifive-x280 +endif + +ifeq ($(TARGET), RISCV64_GENERIC) +TARGET_FLAGS = -march=rv64imafdc -mabi=lp64d +endif + all: getarch_2nd ./getarch_2nd 0 >> $(TARGET_MAKE) ./getarch_2nd 1 >> $(TARGET_CONF) diff --git a/Makefile.riscv64 b/Makefile.riscv64 index ce91e03ec..d6eaf552d 100644 --- a/Makefile.riscv64 +++ b/Makefile.riscv64 @@ -2,3 +2,11 @@ ifeq ($(CORE), C910V) CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 FCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -static endif +ifeq ($(CORE), x280) +CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_xsfvqmaccqoq_xsfvfhbfmin -mabi=lp64d -menable-experimental-extensions -mllvm --riscv-v-vector-bits-min=512 -mcpu=sifive-x280 -ffast-math +FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_xsfvqmaccqoq_xsfvfhbfmin -mabi=lp64d -menable-experimental-extensions -static +endif +ifeq ($(CORE), RISCV64_GENERIC) +CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d +FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d -static +endif \ No newline at end of file diff --git a/README.md b/README.md index 6ce85e08e..6ecb46178 100644 --- a/README.md +++ b/README.md @@ -186,6 +186,11 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th ``` (also known to work on C906) +- **x280**: LLVM auto-vectorization using RISC-V Vector extension 1.0. + ```sh + make HOSTCC=gcc TARGET=x280 NUM_THREADS=8 CC=riscv64-unknown-linux-gnu-clang FC=riscv64-unknown-linux-gnu-gfortran + ``` + ### Support for multiple targets in a single library OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifiying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line or as `-DDYNAMIC_ARCH=TRUE` in cmake. 
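The `--riscv-v-vector-bits-min=512` option above lets the compiler assume a vector register length (VLEN) of at least 512 bits; the kernels added below remain vector-length-agnostic and query the active length at run time through `vsetvl`/`vsetvlmax`. At LMUL=8 that works out to VLMAX = LMUL * VLEN / SEW = 8 * 512 / 32 = 128 single-precision or 64 double-precision elements per vector operation. A minimal stand-alone sketch (hypothetical test program, not part of the patch, using the same pre-`__riscv_` intrinsics naming as the kernels below):

```c
/* Illustration only: print the VLMAX values implied by the build flags above.
 * Assumes a toolchain with the V extension and VLEN >= 512. */
#include <stdio.h>
#include <riscv_vector.h>

int main(void)
{
    /* VLMAX = LMUL * VLEN / SEW; with VLEN = 512 this prints 128 and 64. */
    printf("e32, LMUL=8: vlmax = %zu\n", vsetvlmax_e32m8());
    printf("e64, LMUL=8: vlmax = %zu\n", vsetvlmax_e64m8());
    return 0;
}
```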
diff --git a/TargetList.txt b/TargetList.txt index deef75819..6c533361e 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -120,6 +120,7 @@ Z14 10.RISC-V 64: RISCV64_GENERIC C910V +x280 11.LOONGARCH64: LOONGSONGENERIC diff --git a/benchmark/Makefile b/benchmark/Makefile index f2f3b354a..734c83a26 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -37,6 +37,12 @@ ESSL=/opt/ibm/lib #LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a LIBESSL = -lesslsmp $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a +# x280 temporary workaround for gfortran +ifeq ($(TARGET), x280) +CCOMMON_OPT:=$(filter-out -mllvm --riscv-v-vector-bits-min=512,$(CCOMMON_OPT)) +endif + + ifneq ($(NO_LAPACK), 1) GOTO_LAPACK_TARGETS=slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \ diff --git a/common_riscv64.h b/common_riscv64.h index 7ddbe80a4..221a79901 100644 --- a/common_riscv64.h +++ b/common_riscv64.h @@ -92,6 +92,10 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define SEEK_ADDRESS #if defined(C910V) +#include +#endif + +#if defined(x280) #include #endif diff --git a/cpuid_riscv64.c b/cpuid_riscv64.c index 894d2b873..5326787e6 100644 --- a/cpuid_riscv64.c +++ b/cpuid_riscv64.c @@ -72,10 +72,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CPU_GENERIC 0 #define CPU_C910V 1 +#define CPU_x280 2 static char *cpuname[] = { "RISCV64_GENERIC", "C910V" + "x280" }; int detect(void){ diff --git a/getarch.c b/getarch.c index cde5b4e83..0d197285c 100644 --- a/getarch.c +++ b/getarch.c @@ -1677,6 +1677,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define LIBNAME "c910v" #define CORENAME "C910V" #endif +#endif +#ifdef FORCE_x280 +#define FORCE +#define ARCHITECTURE "RISCV64" +#define SUBARCHITECTURE "x280" +#define SUBDIRNAME "riscv64" +#define ARCHCONFIG "-Dx280 " \ + "-DL1_DATA_SIZE=64536 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "x280" +#define CORENAME "x280" #else #endif diff --git a/kernel/riscv64/KERNEL.x280 b/kernel/riscv64/KERNEL.x280 new file mode 100644 index 000000000..2eb60f2b4 --- /dev/null +++ b/kernel/riscv64/KERNEL.x280 @@ -0,0 +1,267 @@ +# ********************************************************************************** +# Copyright (c) 2022, The OpenBLAS Project +# All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# 3. Neither the name of the OpenBLAS project nor the names of +# its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. 
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ********************************************************************************** + +SAMAXKERNEL = amax_rvv.c +DAMAXKERNEL = amax_rvv.c +CAMAXKERNEL = zamax_rvv.c +ZAMAXKERNEL = zamax_rvv.c + +SAMINKERNEL = amin_rvv.c +DAMINKERNEL = amin_rvv.c +CAMINKERNEL = zamin_rvv.c +ZAMINKERNEL = zamin_rvv.c + +SMAXKERNEL = max_rvv.c +DMAXKERNEL = max_rvv.c + +SMINKERNEL = min_rvv.c +DMINKERNEL = min_rvv.c + +ISAMAXKERNEL = iamax_rvv.c +IDAMAXKERNEL = iamax_rvv.c +ICAMAXKERNEL = izamax_rvv.c +IZAMAXKERNEL = izamax_rvv.c + +ISAMINKERNEL = iamin_rvv.c +IDAMINKERNEL = iamin_rvv.c +ICAMINKERNEL = izamin_rvv.c +IZAMINKERNEL = izamin_rvv.c + +ISMAXKERNEL = imax_rvv.c +IDMAXKERNEL = imax_rvv.c + +ISMINKERNEL = imin_rvv.c +IDMINKERNEL = imin_rvv.c + +SASUMKERNEL = asum_rvv.c +DASUMKERNEL = asum_rvv.c +CASUMKERNEL = zasum_rvv.c +ZASUMKERNEL = zasum_rvv.c + +SSUMKERNEL = sum_rvv.c +DSUMKERNEL = sum_rvv.c +CSUMKERNEL = zsum_rvv.c +ZSUMKERNEL = zsum_rvv.c + +SAXPYKERNEL = axpy_rvv.c +DAXPYKERNEL = axpy_rvv.c +CAXPYKERNEL = zaxpy_rvv.c +ZAXPYKERNEL = zaxpy_rvv.c + +SAXPBYKERNEL = axpby_rvv.c +DAXPBYKERNEL = axpby_rvv.c +CAXPBYKERNEL = zaxpby_rvv.c +ZAXPBYKERNEL = zaxpby_rvv.c + +SCOPYKERNEL = copy_rvv.c +DCOPYKERNEL = copy_rvv.c +CCOPYKERNEL = zcopy_rvv.c +ZCOPYKERNEL = zcopy_rvv.c + +SDOTKERNEL = dot_rvv.c +DDOTKERNEL = dot_rvv.c +CDOTKERNEL = zdot_rvv.c +ZDOTKERNEL = zdot_rvv.c +DSDOTKERNEL = dot_rvv.c + +SNRM2KERNEL = nrm2_rvv.c +DNRM2KERNEL = nrm2_rvv.c +CNRM2KERNEL = znrm2_rvv.c +ZNRM2KERNEL = znrm2_rvv.c + +SROTKERNEL = rot_rvv.c +DROTKERNEL = rot_rvv.c +CROTKERNEL = zrot_rvv.c +ZROTKERNEL = zrot_rvv.c + +SSCALKERNEL = scal_rvv.c +DSCALKERNEL = scal_rvv.c +CSCALKERNEL = zscal_rvv.c +ZSCALKERNEL = zscal_rvv.c + +SSWAPKERNEL = swap_rvv.c +DSWAPKERNEL = swap_rvv.c +CSWAPKERNEL = zswap_rvv.c +ZSWAPKERNEL = zswap_rvv.c + +SGEMVNKERNEL = gemv_n_rvv.c +DGEMVNKERNEL = gemv_n_rvv.c +CGEMVNKERNEL = zgemv_n_rvv.c +ZGEMVNKERNEL = zgemv_n_rvv.c + +SGEMVTKERNEL = gemv_t_rvv.c +DGEMVTKERNEL = gemv_t_rvv.c +CGEMVTKERNEL = zgemv_t_rvv.c +ZGEMVTKERNEL = zgemv_t_rvv.c + +CTRMMKERNEL = ztrmmkernel_2x2_rvv.c +ZTRMMKERNEL = ztrmmkernel_2x2_rvv.c + +# SGEMM_UNROLL_N set in params.h +ifeq ($(SGEMM_UNROLL_N), 2) +SGEMMKERNEL = gemmkernel_2x2_rvv.c +SGEMMONCOPY = gemm_ncopy_2_rvv.c +SGEMMOTCOPY = gemm_tcopy_2_rvv.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +STRMMKERNEL = trmmkernel_2x2_rvv.c +else ifeq ($(SGEMM_UNROLL_N), 4) +SGEMMKERNEL = gemmkernel_4x4_rvv.c +SGEMMONCOPY = gemm_ncopy_4_rvv.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +STRMMKERNEL = trmmkernel_4x4_rvv.c +else ifeq ($(SGEMM_UNROLL_N), 8) +# UNROLL_M is VLMAX +SGEMMKERNEL = gemmkernel_rvv_v1x8.c 
+SGEMMINCOPY = gemm_ncopy_rvv_v1.c +SGEMMITCOPY = gemm_tcopy_rvv_v1.c +SGEMMONCOPY = gemm_ncopy_$(SGEMM_UNROLL_N)_rvv.c +SGEMMOTCOPY = gemm_tcopy_$(SGEMM_UNROLL_N)_rvv.c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRMMKERNEL = trmmkernel_rvv_v1x8.c + +STRMMUNCOPY_M = trmm_uncopy_rvv_v1.c +STRMMLNCOPY_M = trmm_lncopy_rvv_v1.c +STRMMUTCOPY_M = trmm_utcopy_rvv_v1.c +STRMMLTCOPY_M = trmm_ltcopy_rvv_v1.c + +SSYMMUCOPY_M = symm_ucopy_rvv_v1.c +SSYMMLCOPY_M = symm_lcopy_rvv_v1.c +endif + +# SGEMM_UNROLL_N set in params.h +ifeq ($(DGEMM_UNROLL_N), 2) +DGEMMKERNEL = gemmkernel_2x2_rvv.c +DGEMMONCOPY = gemm_ncopy_2_rvv.c +DGEMMOTCOPY = gemm_tcopy_2_rvv.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +DTRMMKERNEL = trmmkernel_2x2_rvv.c +else ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMKERNEL = gemmkernel_4x4_rvv.c +DGEMMONCOPY = gemm_ncopy_4_rvv.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +DTRMMKERNEL = trmmkernel_4x4_rvv.c +else ifeq ($(DGEMM_UNROLL_N), 8) +# UNROLL_M is VLMAX +DGEMMKERNEL = gemmkernel_rvv_v1x8.c +DGEMMINCOPY = gemm_ncopy_rvv_v1.c +DGEMMITCOPY = gemm_tcopy_rvv_v1.c +DGEMMONCOPY = gemm_ncopy_$(DGEMM_UNROLL_N)_rvv.c +DGEMMOTCOPY = gemm_tcopy_$(DGEMM_UNROLL_N)_rvv.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DTRMMKERNEL = trmmkernel_rvv_v1x8.c +DTRMMUNCOPY_M = trmm_uncopy_rvv_v1.c +DTRMMLNCOPY_M = trmm_lncopy_rvv_v1.c +DTRMMUTCOPY_M = trmm_utcopy_rvv_v1.c +DTRMMLTCOPY_M = trmm_ltcopy_rvv_v1.c + +DSYMMUCOPY_M = symm_ucopy_rvv_v1.c +DSYMMLCOPY_M = symm_lcopy_rvv_v1.c +endif + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +STRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c +STRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c +STRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c +STRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c + +DTRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c +DTRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c +DTRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c +DTRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +TRSMCOPYLN_M = trsm_lncopy_rvv_v1.c +TRSMCOPYLT_M = trsm_ltcopy_rvv_v1.c +TRSMCOPYUN_M = trsm_uncopy_rvv_v1.c +TRSMCOPYUT_M = trsm_utcopy_rvv_v1.c + +SSYMV_U_KERNEL = symv_U_rvv.c +SSYMV_L_KERNEL = symv_L_rvv.c +DSYMV_U_KERNEL = symv_U_rvv.c +DSYMV_L_KERNEL = symv_L_rvv.c +CSYMV_U_KERNEL = ../generic/zsymv_k.c +CSYMV_L_KERNEL = ../generic/zsymv_k.c +ZSYMV_U_KERNEL = ../generic/zsymv_k.c +ZSYMV_L_KERNEL = ../generic/zsymv_k.c + + +LSAME_KERNEL = ../generic/lsame.c + +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c 
+QCABS_KERNEL = ../generic/cabs.c + +ifndef SGEMM_BETA +SGEMM_BETA = gemm_beta_rvv.c +endif +ifndef DGEMM_BETA +DGEMM_BETA = gemm_beta_rvv.c +endif +ifndef CGEMM_BETA +CGEMM_BETA = zgemm_beta_rvv.c +endif +ifndef ZGEMM_BETA +ZGEMM_BETA = zgemm_beta_rvv.c +endif diff --git a/kernel/riscv64/amax_rvv.c b/kernel/riscv64/amax_rvv.c new file mode 100644 index 000000000..c9c6e7f73 --- /dev/null +++ b/kernel/riscv64/amax_rvv.c @@ -0,0 +1,102 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT vfmax_vv_f32m8 +#define VFABSV_FLOAT vfabs_v_f32m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT vfmax_vv_f64m8 +#define VFABSV_FLOAT vfabs_v_f64m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT maxf = 0.0; + + if (n <= 0 || inc_x <= 0) return(maxf); + + FLOAT_V_T vx, vmax; + FLOAT_V_T_M1 v_res; + + v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vmax = VFMVVF_FLOAT(0.0, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vx = VFABSV_FLOAT(vx, vl); + vmax = VFMAXVV_FLOAT(vmax, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vx = VFABSV_FLOAT(vx, vl); + vmax = VFMAXVV_FLOAT(vmax, vx, vl); + } + + } + + v_res = VFREDMAXVS_FLOAT(v_res, vmax, v_res, vlmax); + maxf = VFMVFS_FLOAT_M1(v_res); + + return(maxf); +} diff --git a/kernel/riscv64/amin_rvv.c b/kernel/riscv64/amin_rvv.c new file mode 100644 index 000000000..370b6c338 --- /dev/null +++ b/kernel/riscv64/amin_rvv.c @@ -0,0 +1,102 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
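The level-1 kernels in this patch all use the strip-mining loop shown above: `VSETVL(n)` returns how many elements (`vl`) the current iteration will process, the pointers advance by `vl` (or `vl*inc_x` for strided data, with the byte stride passed to the strided load), and a final `vfredmax` folds the per-lane partial maxima into a single scalar. For comparison, a scalar sketch of what this AMAX kernel computes (illustration only, not part of the patch, single-precision case):

```c
#include <math.h>

/* Reference semantics of the vector AMAX kernel above: largest absolute
 * value of n elements read with stride inc_x (0.0 for non-positive n). */
static float amax_ref(long n, const float *x, long inc_x)
{
    float maxf = 0.0f;
    for (long i = 0; i < n; i++)
        maxf = fmaxf(maxf, fabsf(x[i * inc_x]));
    return maxf;
}
```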
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMINVV_FLOAT vfmin_vv_f32m8 +#define VFABSV_FLOAT vfabs_v_f32m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMINVV_FLOAT vfmin_vv_f64m8 +#define VFABSV_FLOAT vfabs_v_f64m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT minf = 0.0; + + if (n <= 0 || inc_x <= 0) return(minf); + + FLOAT_V_T vx, vmin; + FLOAT_V_T_M1 v_res; + + v_res = VFMVVF_FLOAT_M1(FLT_MAX, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vmin = VFMVVF_FLOAT(FLT_MAX, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vx = VFABSV_FLOAT(vx, vl); + vmin = VFMINVV_FLOAT(vmin, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vx = VFABSV_FLOAT(vx, vl); + vmin = VFMINVV_FLOAT(vmin, vx, vl); + } + + } + + v_res = VFREDMINVS_FLOAT(v_res, vmin, v_res, vlmax); + minf = VFMVFS_FLOAT_M1(v_res); + + return(minf); +} diff --git a/kernel/riscv64/asum_rvv.c b/kernel/riscv64/asum_rvv.c new file mode 100644 index 000000000..4f711c9be --- /dev/null +++ b/kernel/riscv64/asum_rvv.c @@ -0,0 +1,99 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFADDVV_FLOAT vfadd_vv_f32m8 +#define VFABSV_FLOAT vfabs_v_f32m8 +#define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFADDVV_FLOAT vfadd_vv_f64m8 +#define VFABSV_FLOAT vfabs_v_f64m8 +#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT asumf = 0.0; + if (n <= 0 || inc_x <= 0) return(asumf); + + FLOAT_V_T vx, vsum; + FLOAT_V_T_M1 v_res; + + v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vsum = VFMVVF_FLOAT(0.0, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vx = VFABSV_FLOAT(vx, vl); + vsum = VFADDVV_FLOAT(vsum, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vx = VFABSV_FLOAT(vx, vl); + vsum = VFADDVV_FLOAT(vsum, vx, vl); + } + + } + + v_res = VFREDSUMVS_FLOAT(v_res, vsum, v_res, vlmax); + asumf = VFMVFS_FLOAT_M1(v_res); + return(asumf); +} diff --git a/kernel/riscv64/axpby_rvv.c b/kernel/riscv64/axpby_rvv.c new file mode 100644 index 000000000..7c35c563d --- /dev/null +++ b/kernel/riscv64/axpby_rvv.c @@ -0,0 +1,171 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 +#define VFMACCVF_FLOAT vfmacc_vf_f32m8 +#define VFMULVF_FLOAT vfmul_vf_f32m8 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 +#define VFMACCVF_FLOAT vfmacc_vf_f64m8 +#define VFMULVF_FLOAT vfmul_vf_f64m8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#endif + +int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y) +{ + FLOAT_V_T vx, vy; + + if ( n < 0 ) return(0); + + if ( beta == 0.0 ) { + if ( alpha == 0.0 ) { + if (1 == inc_y) { + memset(&y[0], 0, n * sizeof(FLOAT)); + } else { + BLASLONG stride_y = inc_y * sizeof(FLOAT); + size_t vl = VSETVL(n); + vy = VFMVVF_FLOAT(0.0, vl); + for ( ; n > 0; n -= vl, y += vl*stride_y) { + vl = VSETVL(n); + VSSEV_FLOAT(y, stride_y, vy, vl); + } + } + + } else { + if ((1 == inc_x) && (1 == inc_y)) { + for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { + vl = VSETVL(n); + vx = VLEV_FLOAT(x, vl); + vy = VFMULVF_FLOAT(vx, alpha, vl); + VSEV_FLOAT (y, vy, vl); + } + } else if (1 == inc_x) { + BLASLONG stride_y = inc_y * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = VSETVL(n); + vx = VLEV_FLOAT(x, vl); + vy = VFMULVF_FLOAT(vx, alpha, vl); + VSSEV_FLOAT (y, stride_y, vy, vl); + } + } else if (1 == inc_y) { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = VSETVL(n); + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VFMULVF_FLOAT(vx, alpha, vl); + VSEV_FLOAT (y, vy, vl); + } + } else { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = VSETVL(n); + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VFMULVF_FLOAT(vx, alpha, vl); + VSSEV_FLOAT (y, stride_y, vy, vl); + } + } + } + + } else { + if ( alpha == 0.0 ) { + if (1 == inc_y) { + for (size_t vl; n > 0; n -= vl, y += vl) { + vl = VSETVL(n); + vy = VLEV_FLOAT(y, vl); + vy = VFMULVF_FLOAT(vy, beta, vl); + VSEV_FLOAT (y, vy, vl); + } + } else { + BLASLONG stride_y = inc_y * sizeof(FLOAT); + for (size_t vl; n > 0; n -= 
vl, y += vl*inc_y) { + vl = VSETVL(n); + vy = VLSEV_FLOAT(y, stride_y, vl); + vy = VFMULVF_FLOAT(vy, beta, vl); + VSSEV_FLOAT (y, stride_y, vy, vl); + } + } + + } else { + if ((1 == inc_x) && (1 == inc_y)) { + for (size_t vl; n > 0; n -= vl, y += vl) { + vl = VSETVL(n); + vy = VLEV_FLOAT(y, vl); + vy = VFMULVF_FLOAT(vy, beta, vl); + VSEV_FLOAT (y, vy, vl); + } + } else if (1 == inc_x) { + BLASLONG stride_y = inc_y * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = VSETVL(n); + vx = VLEV_FLOAT(x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + vy = VFMULVF_FLOAT(vy, beta, vl); + vy = VFMACCVF_FLOAT(vy, alpha, vx, vl); + VSSEV_FLOAT (y, stride_y, vy, vl); + } + } else if (1 == inc_y) { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = VSETVL(n); + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLEV_FLOAT(y, vl); + vy = VFMULVF_FLOAT(vy, beta, vl); + vy = VFMACCVF_FLOAT(vy, alpha, vx, vl); + VSEV_FLOAT (y, vy, vl); + } + } else { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = VSETVL(n); + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + vy = VFMULVF_FLOAT(vy, beta, vl); + vy = VFMACCVF_FLOAT(vy, alpha, vx, vl); + VSSEV_FLOAT (y, stride_y, vy, vl); + } + } + } + } + + return(0); +} diff --git a/kernel/riscv64/axpy_rvv.c b/kernel/riscv64/axpy_rvv.c new file mode 100644 index 000000000..3986f4e21 --- /dev/null +++ b/kernel/riscv64/axpy_rvv.c @@ -0,0 +1,109 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
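Both the AXPBY kernel just above and the AXPY kernel that follows dispatch on the four unit/non-unit stride combinations: contiguous data uses the unit-stride loads/stores (`vle`/`vse`), strided data uses `vlse`/`vsse` with a byte stride of `inc * sizeof(FLOAT)`, and the loop pointers advance by `vl * inc` elements. Their scalar semantics, for reference (illustration only, not part of the patch; the vector AXPBY additionally special-cases `beta == 0` so that `y` is written without being read):

```c
/* Scalar reference for the surrounding kernels (single-precision shown):
 * AXPBY: y := alpha*x + beta*y      AXPY: y := da*x + y */
static void axpby_ref(long n, float alpha, const float *x, long inc_x,
                      float beta, float *y, long inc_y)
{
    for (long i = 0; i < n; i++)
        y[i * inc_y] = alpha * x[i * inc_x] + beta * y[i * inc_y];
}

static void axpy_ref(long n, float da, const float *x, long inc_x,
                     float *y, long inc_y)
{
    for (long i = 0; i < n; i++)
        y[i * inc_y] += da * x[i * inc_x];
}
```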
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 +#define VFMACCVF_FLOAT vfmacc_vf_f32m8 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 +#define VFMACCVF_FLOAT vfmacc_vf_f64m8 +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + if ( n <= 0 ) return(0); + if ( da == 0.0 ) return(0); + + FLOAT_V_T vx, vy; + + if(inc_x == 1 && inc_y == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vy = VLEV_FLOAT(y, vl); + vy = VFMACCVF_FLOAT(vy, da, vx, vl); + VSEV_FLOAT (y, vy, vl); + } + + } else if (1 == inc_y) { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLEV_FLOAT(y, vl); + vy = VFMACCVF_FLOAT(vy, da, vx, vl); + VSEV_FLOAT(y, vy, vl); + } + + } else if (1 == inc_x) { + + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + vy = VFMACCVF_FLOAT(vy, da, vx, vl); + VSSEV_FLOAT(y, stride_y, vy, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + vy = VFMACCVF_FLOAT(vy, da, vx, vl); + VSSEV_FLOAT(y, stride_y, vy, vl); + } + + } + + return(0); +} diff --git a/kernel/riscv64/copy_rvv.c b/kernel/riscv64/copy_rvv.c new file mode 100644 index 000000000..5d5a8bd04 --- /dev/null +++ b/kernel/riscv64/copy_rvv.c @@ -0,0 +1,94 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 +#endif + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + if(n < 0) return(0); + + FLOAT_V_T v0; + + if(inc_x == 1 && inc_y == 1) { + + for(size_t vl; n > 0; n -= vl, x += vl, y += vl) { + vl = VSETVL(n); + v0 = VLEV_FLOAT(x, vl); + VSEV_FLOAT(y, v0, vl); + } + + } else if (inc_y == 1) { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for(size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = VSETVL(n); + v0 = VLSEV_FLOAT(x, stride_x, vl); + VSEV_FLOAT(y, v0, vl); + } + + } else if(inc_x == 1) { + + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for(size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = VSETVL(n); + v0 = VLEV_FLOAT(x, vl); + VSSEV_FLOAT(y, stride_y, v0, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for(size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = VSETVL(n); + v0 = VLSEV_FLOAT(x, stride_x, vl); + VSSEV_FLOAT(y, stride_y, v0, vl); + } + + } + + return(0); +} diff --git a/kernel/riscv64/dot_rvv.c b/kernel/riscv64/dot_rvv.c new file mode 100644 index 000000000..60dcac2f5 --- /dev/null +++ b/kernel/riscv64/dot_rvv.c @@ -0,0 +1,126 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if defined(DSDOT) +double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#else +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#endif +{ + double dot = 0.0; + + if ( n <= 0 ) return(dot); + + size_t vlmax = vsetvlmax_e64m8(); + vfloat64m8_t vr = vfmv_v_f_f64m8(0, vlmax); + + if(inc_x == 1 && inc_y == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { + vl = vsetvl_e64m8(n); + +#if !defined(DOUBLE) + vfloat32m4_t vx = vle32_v_f32m4(x, vl); + vfloat32m4_t vy = vle32_v_f32m4(y, vl); + + vr = vfwmacc_vv_f64m8(vr, vx, vy, vl); +#else + vfloat64m8_t vx = vle64_v_f64m8(x, vl); + vfloat64m8_t vy = vle64_v_f64m8(y, vl); + + vr = vfmacc_vv_f64m8(vr, vx, vy, vl); +#endif + } + + } else if (1 == inc_x) { + + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = vsetvl_e64m8(n); + +#if !defined(DOUBLE) + vfloat32m4_t vx = vle32_v_f32m4(x, vl); + vfloat32m4_t vy = vlse32_v_f32m4(y, stride_y, vl); + + vr = vfwmacc_vv_f64m8(vr, vx, vy, vl); +#else + vfloat64m8_t vx = vle64_v_f64m8(x, vl); + vfloat64m8_t vy = vlse64_v_f64m8(y, stride_y, vl); + + vr = vfmacc_vv_f64m8(vr, vx, vy, vl); +#endif + } + } else if (1 == inc_y) { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = vsetvl_e64m8(n); + +#if !defined(DOUBLE) + vfloat32m4_t vx = vlse32_v_f32m4(x, stride_x, vl); + vfloat32m4_t vy = vle32_v_f32m4(y, vl); + + vr = vfwmacc_vv_f64m8(vr, vx, vy, vl); +#else + vfloat64m8_t vx = vlse64_v_f64m8(x, stride_x, vl); + vfloat64m8_t vy = vle64_v_f64m8(y, vl); + + vr = vfmacc_vv_f64m8(vr, vx, vy, vl); +#endif + } + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = vsetvl_e64m8(n); + +#if !defined(DOUBLE) + vfloat32m4_t vx = vlse32_v_f32m4(x, stride_x, vl); + vfloat32m4_t vy = vlse32_v_f32m4(y, stride_y, vl); + + vr = vfwmacc_vv_f64m8(vr, vx, vy, vl); +#else + vfloat64m8_t vx = vlse64_v_f64m8(x, stride_x, vl); + vfloat64m8_t vy = vlse64_v_f64m8(y, stride_y, vl); + + vr = vfmacc_vv_f64m8(vr, vx, vy, vl); +#endif + } + } + + vfloat64m1_t vec_zero = vfmv_v_f_f64m1(0, vlmax); + vfloat64m1_t vec_sum = vfredusum_vs_f64m8_f64m1(vec_zero, vr, vec_zero, vlmax); + dot = vfmv_f_s_f64m1_f64(vec_sum); + + return(dot); +} diff --git a/kernel/riscv64/gemm_beta_rvv.c b/kernel/riscv64/gemm_beta_rvv.c new file mode 100644 index 000000000..34d1ea078 --- /dev/null +++ b/kernel/riscv64/gemm_beta_rvv.c @@ -0,0 +1,89 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
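One detail worth noting in the DOT kernel above: in the single-precision build the inputs are loaded as `vfloat32m4_t` and accumulated with the widening FMA `vfwmacc_vv_f64m8`, so the running sum is kept in double precision for SDOT and DSDOT alike, while the double build accumulates directly with `vfmacc_vv_f64m8`. A scalar sketch of that accumulation (illustration only, not part of the patch):

```c
/* Reference for the widening accumulation used above: products and the
 * running sum are held in double even for single-precision inputs. */
static double dot_ref(long n, const float *x, const float *y)
{
    double dot = 0.0;
    for (long i = 0; i < n; i++)
        dot += (double)x[i] * (double)y[i];
    return dot;
}
```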
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMULVF_FLOAT vfmul_vf_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMULVF_FLOAT vfmul_vf_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#endif + +// Optimizes the implementation in ../generic/gemm_beta.c + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, + IFLOAT *dummy2, BLASLONG dummy3, IFLOAT *dummy4, BLASLONG dummy5, + FLOAT *c, BLASLONG ldc) +{ + BLASLONG chunk; + FLOAT *c_offset; + size_t vl; + FLOAT_V_T vx; + + if (beta == ZERO) { + + vl = VSETVL(m); + vx = VFMVVF_FLOAT(0.0, vl); + + for( ; n > 0; n--, c += ldc) { + c_offset = c; + + for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl) { + vl = VSETVL(chunk); + + VSEV_FLOAT(c_offset, vx, vl); + } + } + + } else { + + for( ; n > 0; n--, c += ldc) { + c_offset = c; + + for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl) { + vl = VSETVL(chunk); + + vx = VLEV_FLOAT(c_offset, vl); + vx = VFMULVF_FLOAT(vx, beta, vl); + VSEV_FLOAT(c_offset, vx, vl); + } + } + + } + + return 0; +} diff --git a/kernel/riscv64/gemm_ncopy_2_rvv.c b/kernel/riscv64/gemm_ncopy_2_rvv.c new file mode 100644 index 000000000..5f55bc349 --- /dev/null +++ b/kernel/riscv64/gemm_ncopy_2_rvv.c @@ -0,0 +1,92 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLEV_FLOAT vle32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEG2_FLOAT vsseg2e32_v_f32m4 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLEV_FLOAT vle64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEG2_FLOAT vsseg2e64_v_f64m4 +#endif + +// Optimizes the implementation in ../generic/gemm_ncopy_2.c + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) +{ + BLASLONG i, j; + IFLOAT *a_offset, *a_offset1, *a_offset2; + IFLOAT *b_offset; + FLOAT_V_T v1, v2; + size_t vl; + + //fprintf(stderr, "gemm_ncopy_2 m=%ld n=%ld lda=%ld\n", m, n, lda); // KU + + a_offset = a; + b_offset = b; + + for(j = (n >> 1); j > 0; j--) { + + a_offset1 = a_offset; + a_offset2 = a_offset + lda; + a_offset += 2 * lda; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + v2 = VLEV_FLOAT(a_offset2, vl); + VSSEG2_FLOAT(b_offset, v1, v2, vl); + + a_offset1 += vl; + a_offset2 += vl; + b_offset += vl*2; + } + } + + if (n & 1) { + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset, vl); + VSEV_FLOAT(b_offset, v1, vl); + + a_offset += vl; + b_offset += vl; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemm_ncopy_4_rvv.c b/kernel/riscv64/gemm_ncopy_4_rvv.c new file mode 100644 index 000000000..4d4efe4c9 --- /dev/null +++ b/kernel/riscv64/gemm_ncopy_4_rvv.c @@ -0,0 +1,123 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
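The NCOPY routines pack column panels with segment stores: `vsseg2` above writes two column vectors interleaved element by element, and the 4- and 8-column variants that follow do the same with `vsseg4`/`vsseg8`. A scalar view of the 2-column packing (illustration only, not part of the patch):

```c
/* What the vsseg2-based inner loop above produces: columns col0 and col1
 * of A interleaved into the packed buffer b as col0[0], col1[0], col0[1], ... */
static void ncopy_2_ref(long m, const float *col0, const float *col1, float *b)
{
    for (long i = 0; i < m; i++) {
        *b++ = col0[i];
        *b++ = col1[i];
    }
}
```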
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VSSEG4_FLOAT vsseg4e32_v_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VSSEG4_FLOAT vsseg4e64_v_f64m2 +#endif + +// Optimizes the implementation in ../generic/gemm_ncopy_4.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) +{ + BLASLONG i, j; + + FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; + FLOAT *b_offset; + + FLOAT_V_T v1, v2, v3, v4; + size_t vl; + + //fprintf(stderr, "gemm_ncopy_4 m=%ld n=%ld lda=%ld\n", m, n, lda); + + a_offset = a; + b_offset = b; + + for(j = (n >> 2); j > 0; j--) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + v2 = VLEV_FLOAT(a_offset2, vl); + v3 = VLEV_FLOAT(a_offset3, vl); + v4 = VLEV_FLOAT(a_offset4, vl); + + VSSEG4_FLOAT(b_offset, v1, v2, v3, v4, vl); + + a_offset1 += vl; + a_offset2 += vl; + a_offset3 += vl; + a_offset4 += vl; + b_offset += vl*4; + } + } + + if (n & 2) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + v2 = VLEV_FLOAT(a_offset2, vl); + + VSSEG2_FLOAT(b_offset, v1, v2, vl); + + a_offset1 += vl; + a_offset2 += vl; + b_offset += vl*2; + } + } + + if (n & 1) { + a_offset1 = a_offset; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + + VSEV_FLOAT(b_offset, v1, vl); + + a_offset1 += vl; + b_offset += vl; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemm_ncopy_8_rvv.c b/kernel/riscv64/gemm_ncopy_8_rvv.c new file mode 100644 index 000000000..525b223c2 --- /dev/null +++ b/kernel/riscv64/gemm_ncopy_8_rvv.c @@ -0,0 +1,164 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m1(n) +#define FLOAT_V_T vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m1 +#define VSEV_FLOAT vse32_v_f32m1 +#define VSSEG2_FLOAT vsseg2e32_v_f32m1 +#define VSSEG4_FLOAT vsseg4e32_v_f32m1 +#define VSSEG8_FLOAT vsseg8e32_v_f32m1 +#else +#define VSETVL(n) vsetvl_e64m1(n) +#define FLOAT_V_T vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m1 +#define VSEV_FLOAT vse64_v_f64m1 +#define VSSEG2_FLOAT vsseg2e64_v_f64m1 +#define VSSEG4_FLOAT vsseg4e64_v_f64m1 +#define VSSEG8_FLOAT vsseg8e64_v_f64m1 +#endif + +// Optimizes the implementation in ../generic/gemm_ncopy_8.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) +{ + BLASLONG i, j; + + FLOAT *a_offset; + FLOAT *a_offset1, *a_offset2, *a_offset3, *a_offset4; + FLOAT *a_offset5, *a_offset6, *a_offset7, *a_offset8; + FLOAT *b_offset; + + FLOAT_V_T v1, v2, v3, v4, v5, v6, v7, v8; + size_t vl; + + //fprintf(stderr, "gemm_ncopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda); + + a_offset = a; + b_offset = b; + + for(j = (n >> 3); j > 0; j--) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset5 = a_offset4 + lda; + a_offset6 = a_offset5 + lda; + a_offset7 = a_offset6 + lda; + a_offset8 = a_offset7 + lda; + a_offset += 8 * lda; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + v2 = VLEV_FLOAT(a_offset2, vl); + v3 = VLEV_FLOAT(a_offset3, vl); + v4 = VLEV_FLOAT(a_offset4, vl); + v5 = VLEV_FLOAT(a_offset5, vl); + v6 = VLEV_FLOAT(a_offset6, vl); + v7 = VLEV_FLOAT(a_offset7, vl); + v8 = VLEV_FLOAT(a_offset8, vl); + + VSSEG8_FLOAT(b_offset, v1, v2, v3, v4, v5, v6, v7, v8, vl); + + a_offset1 += vl; + a_offset2 += vl; + a_offset3 += vl; + a_offset4 += vl; + a_offset5 += vl; + a_offset6 += vl; + a_offset7 += vl; + a_offset8 += vl; + b_offset += vl*8; + } + } + + if (n & 4) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + v2 = VLEV_FLOAT(a_offset2, vl); + v3 = VLEV_FLOAT(a_offset3, vl); + v4 = VLEV_FLOAT(a_offset4, vl); + + VSSEG4_FLOAT(b_offset, v1, v2, v3, v4, vl); 
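            /* Note: vsseg4 interleaves the four column vectors into the packed
             * buffer, so b ends up holding one row of the 4-column panel per
             * group of four consecutive elements.  A rough scalar sketch of the
             * same packing (names as in this file, state at this point in the
             * loop, before the pointers advance):
             *
             *     for (BLASLONG r = 0; r < vl; r++) {
             *         b_offset[4*r + 0] = a_offset1[r];
             *         b_offset[4*r + 1] = a_offset2[r];
             *         b_offset[4*r + 2] = a_offset3[r];
             *         b_offset[4*r + 3] = a_offset4[r];
             *     }
             */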
+ + a_offset1 += vl; + a_offset2 += vl; + a_offset3 += vl; + a_offset4 += vl; + b_offset += vl*4; + } + } + + if (n & 2) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + v2 = VLEV_FLOAT(a_offset2, vl); + + VSSEG2_FLOAT(b_offset, v1, v2, vl); + + a_offset1 += vl; + a_offset2 += vl; + b_offset += vl*2; + } + } + + if (n & 1) { + a_offset1 = a_offset; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + + VSEV_FLOAT(b_offset, v1, vl); + + a_offset1 += vl; + b_offset += vl; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemm_ncopy_rvv_v1.c b/kernel/riscv64/gemm_ncopy_rvv_v1.c new file mode 100644 index 000000000..2c5230752 --- /dev/null +++ b/kernel/riscv64/gemm_ncopy_rvv_v1.c @@ -0,0 +1,76 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) +{ + BLASLONG i, j; + + FLOAT *a_offset; + FLOAT *a_offset1; + FLOAT *b_offset; + + FLOAT_V_T v0; + size_t vl; + + //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); + + a_offset = a; + b_offset = b; + + for(j = n; j > 0; j -= vl) { + vl = VSETVL(j); + + a_offset1 = a_offset; + a_offset += vl * lda; + + for(i = m; i > 0; i--) { + v0 = VLSEV_FLOAT(a_offset1, lda * sizeof(FLOAT), vl); + VSEV_FLOAT(b_offset, v0, vl); + + a_offset1++; + b_offset += vl; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemm_tcopy_2_rvv.c b/kernel/riscv64/gemm_tcopy_2_rvv.c new file mode 100644 index 000000000..963e1be69 --- /dev/null +++ b/kernel/riscv64/gemm_tcopy_2_rvv.c @@ -0,0 +1,108 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VSSSEG2_FLOAT vssseg2e32_v_f32m2 +#define VSSSEG4_FLOAT vssseg4e32_v_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VSSSEG2_FLOAT vssseg2e64_v_f64m2 +#define VSSSEG4_FLOAT vssseg4e64_v_f64m2 +#endif + +// Optimizes the implementation in ../generic/gemm_tcopy_2.c + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) +{ + BLASLONG i, j; + IFLOAT *a_offset, *a_offset1, *a_offset2; + IFLOAT *b_offset, *b_offset1, *b_offset2; + FLOAT_V_T v1a, v1b, v2a, v2b; + size_t vl; + + //fprintf(stderr, "gemm_tcopy_2 m=%ld n=%ld lda=%ld\n", m, n, lda); // KU + + a_offset = a; + b_offset = b; + b_offset2 = b + m * (n & ~1); + + for(i = (m >> 1); i > 0; i--) { + + a_offset1 = a_offset; + a_offset2 = a_offset + lda; + a_offset += 2 * lda; + + b_offset1 = b_offset; + b_offset += 4; + + for(j = (n >> 1); j > 0; j -= vl) { + vl = VSETVL(j); + + VLSEG2_FLOAT(&v1a, &v1b, a_offset1, vl); + VLSEG2_FLOAT(&v2a, &v2b, a_offset2, vl); + + VSSSEG4_FLOAT(b_offset1, m*2*sizeof(FLOAT), v1a, v1b, v2a, v2b, vl); + + a_offset1 += vl * 2; + a_offset2 += vl * 2; + b_offset1 += vl * m * 2; + } + + if (n & 1) { + *(b_offset2 + 0) = *(a_offset1 + 0); + *(b_offset2 + 1) = *(a_offset2 + 0); + b_offset2 += 2; + } + } + + if (m & 1) { + + for(j = (n >> 1); j > 0; j -= vl) { + vl = VSETVL(j); + + VLSEG2_FLOAT(&v1a, &v1b, a_offset, vl); + + VSSSEG2_FLOAT(b_offset, m*2*sizeof(FLOAT), v1a, v1b, vl); + + a_offset += vl * 2; + b_offset += vl * m * 2; + } + + if (n & 1){ + *(b_offset2 + 0) = *(a_offset + 0); + } + } + + return 0; +} diff --git a/kernel/riscv64/gemm_tcopy_4_rvv.c b/kernel/riscv64/gemm_tcopy_4_rvv.c new file mode 100644 index 000000000..ac9974b24 --- /dev/null +++ b/kernel/riscv64/gemm_tcopy_4_rvv.c @@ -0,0 +1,236 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define FLOAT_V_T vfloat32m2_t
+#define VLEV_FLOAT vle32_v_f32m2
+#define VSEV_FLOAT vse32_v_f32m2
+#define VSSEG4_FLOAT vsseg4e32_v_f32m2
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define FLOAT_V_T vfloat64m2_t
+#define VLEV_FLOAT vle64_v_f64m2
+#define VSEV_FLOAT vse64_v_f64m2
+#define VSSEG4_FLOAT vsseg4e64_v_f64m2
+#endif
+
+// Optimizes the implementation in ../generic/gemm_tcopy_4.c
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b)
+{
+    BLASLONG i, j;
+
+    FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4;
+    FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3;
+    FLOAT ctemp1, ctemp2, ctemp3, ctemp4;
+    FLOAT ctemp5, ctemp6, ctemp7, ctemp8;
+    FLOAT_V_T v1, v2, v3, v4;
+
+    //fprintf(stderr, "gemm_tcopy_4 m=%ld n=%ld lda=%ld\n", m, n, lda);
+
+    a_offset = a;
+    b_offset = b;
+
+    b_offset2 = b + m * (n & ~3);
+    b_offset3 = b + m * (n & ~1);
+
+    for(j = (m >> 2); j > 0; j--) {
+        a_offset1 = a_offset;
+        a_offset2 = a_offset1 + lda;
+        a_offset3 = a_offset2 + lda;
+        a_offset4 = a_offset3 + lda;
+        a_offset += 4 * lda;
+
+        b_offset1 = b_offset;
+        b_offset += 16;
+
+        for(i = (n >> 2); i > 0; i--) {
+            v1 = VLEV_FLOAT(a_offset1, 4);
+            v2 = VLEV_FLOAT(a_offset2, 4);
+            v3 = VLEV_FLOAT(a_offset3, 4);
+            v4 = VLEV_FLOAT(a_offset4, 4);
+
+            a_offset1 += 4;
+            a_offset2 += 4;
+            a_offset3 += 4;
+            a_offset4 += 4;
+
+            // each 4x4 block occupies 16 consecutive elements at b_offset1
+            VSEV_FLOAT(b_offset1,      v1, 4);
+            VSEV_FLOAT(b_offset1 + 4,  v2, 4);
+            VSEV_FLOAT(b_offset1 + 8,  v3, 4);
+            VSEV_FLOAT(b_offset1 + 12, v4, 4);
+
+            b_offset1 += m * 4;
+        }
+
+        if (n & 2) {
+            v1 = VLEV_FLOAT(a_offset1, 2);
+            v2 = VLEV_FLOAT(a_offset2, 2);
+            v3 = VLEV_FLOAT(a_offset3, 2);
+            v4 = VLEV_FLOAT(a_offset4, 2);
+
+            a_offset1 += 2;
+            a_offset2 += 2;
+            a_offset3 += 2;
+            a_offset4 += 2;
+
+            VSEV_FLOAT(b_offset2,     v1, 2);
+            VSEV_FLOAT(b_offset2 + 2, v2, 2);
+            VSEV_FLOAT(b_offset2 + 4, v3, 2);
+            VSEV_FLOAT(b_offset2 + 6, v4, 2);
+
+            b_offset2 += 8;
+        }
+
+        if (n & 1) {
+            v1 = VLEV_FLOAT(a_offset1, 1);
+            v2 = VLEV_FLOAT(a_offset2, 1);
+            v3 = VLEV_FLOAT(a_offset3, 1);
+            v4 = VLEV_FLOAT(a_offset4, 1);
+
+            VSSEG4_FLOAT(b_offset3, v1, v2, v3, v4, 1);
+
+            b_offset3 += 4;
+        }
+
+    }
+
+// TODO cleanup
+
+    if (m & 2){
+        a_offset1 = a_offset;
+        a_offset2 = a_offset1 + lda;
+        a_offset += 2 * lda;
+
+        b_offset1 = b_offset;
+        b_offset += 8;
+
+        i = (n >> 2);
+        if (i > 0){
+            do{
+                ctemp1 = *(a_offset1 + 0);
+                ctemp2 = *(a_offset1 + 1);
+                ctemp3 = *(a_offset1 + 2);
+                ctemp4 = *(a_offset1 + 3);
+
+                ctemp5 = *(a_offset2 + 0);
+                ctemp6 = *(a_offset2 + 1);
+                ctemp7 = *(a_offset2 + 2);
+                ctemp8 = *(a_offset2 + 3);
+
+                a_offset1 += 4;
+                a_offset2 += 4;
+
+                *(b_offset1 + 0) = ctemp1;
+                *(b_offset1 + 1) = ctemp2;
+                *(b_offset1 + 2) = ctemp3;
+                *(b_offset1 + 3) = ctemp4;
+
+                *(b_offset1 + 4) = ctemp5;
+                *(b_offset1 + 5) = ctemp6;
+
*(b_offset1 + 6) = ctemp7; + *(b_offset1 + 7) = ctemp8; + + b_offset1 += m * 4; + i --; + }while(i > 0); + } + + if (n & 2) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + + ctemp3 = *(a_offset2 + 0); + ctemp4 = *(a_offset2 + 1); + + a_offset1 += 2; + a_offset2 += 2; + + *(b_offset2 + 0) = ctemp1; + *(b_offset2 + 1) = ctemp2; + *(b_offset2 + 2) = ctemp3; + *(b_offset2 + 3) = ctemp4; + + b_offset2 += 4; + } + + if (n & 1) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset2 + 0); + + *(b_offset3 + 0) = ctemp1; + *(b_offset3 + 1) = ctemp2; + b_offset3 += 2; + } + } + + if (m & 1){ + a_offset1 = a_offset; + b_offset1 = b_offset; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + a_offset1 += 4; + + *(b_offset1 + 0) = ctemp1; + *(b_offset1 + 1) = ctemp2; + *(b_offset1 + 2) = ctemp3; + *(b_offset1 + 3) = ctemp4; + + b_offset1 += 4 * m; + + i --; + }while(i > 0); + } + + if (n & 2) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + a_offset1 += 2; + + *(b_offset2 + 0) = ctemp1; + *(b_offset2 + 1) = ctemp2; + } + + if (n & 1) { + ctemp1 = *(a_offset1 + 0); + *(b_offset3 + 0) = ctemp1; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemm_tcopy_8_rvv.c b/kernel/riscv64/gemm_tcopy_8_rvv.c new file mode 100644 index 000000000..81c1f962b --- /dev/null +++ b/kernel/riscv64/gemm_tcopy_8_rvv.c @@ -0,0 +1,264 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m1(n) +#define FLOAT_V_T vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m1 +#define VLSEV_FLOAT vlse32_v_f32m1 +#define VSEV_FLOAT vse32_v_f32m1 +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m1 +#define VSSEG2_FLOAT vsseg2e32_v_f32m1 +#define VLSSEG4_FLOAT vlsseg4e32_v_f32m1 +#define VSSEG4_FLOAT vsseg4e32_v_f32m1 +#define VLSSEG8_FLOAT vlsseg8e32_v_f32m1 +#define VSSEG8_FLOAT vsseg8e32_v_f32m1 +#else +#define VSETVL(n) vsetvl_e64m1(n) +#define FLOAT_V_T vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m1 +#define VLSEV_FLOAT vlse64_v_f64m1 +#define VSEV_FLOAT vse64_v_f64m1 +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m1 +#define VSSEG2_FLOAT vsseg2e64_v_f64m1 +#define VLSSEG4_FLOAT vlsseg4e64_v_f64m1 +#define VSSEG4_FLOAT vsseg4e64_v_f64m1 +#define VLSSEG8_FLOAT vlsseg8e64_v_f64m1 +#define VSSEG8_FLOAT vsseg8e64_v_f64m1 +#endif + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) +{ + BLASLONG i, j; + + IFLOAT *aoffset; + IFLOAT *aoffset1; + + IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4; + + FLOAT_V_T v0, v1, v2, v3, v4, v5, v6, v7; + + // fprintf(stderr, "gemm_tcopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda); + + aoffset = a; + boffset = b; + boffset2 = b + m * (n & ~7); + boffset3 = b + m * (n & ~3); + boffset4 = b + m * (n & ~1); + + for(j = (m >> 3); j > 0; j--) { + + aoffset1 = aoffset; + aoffset += 8 * lda; + + boffset1 = boffset; + boffset += 64; + + for(i = (n >> 3); i > 0; i--) { + size_t vl = 8; + + VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT), vl); + VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl); + + aoffset1 += 8; + boffset1 += m * 8; + } + + if (n & 4) { + size_t vl = 8; + + VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT), vl); + VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl); + + aoffset1 += 4; + boffset2 += 32; + } + + if (n & 2) { + size_t vl = 8; + + VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT), vl); + VSSEG2_FLOAT(boffset3, v0, v1, vl); + + aoffset1 += 2; + boffset3 += 16; + } + + if (n & 1) { + size_t vl = 8; + + v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSEV_FLOAT(boffset4, v0, vl); + + aoffset1 += 1; + boffset4 += 8; + } + + } + + if (m & 4) { + + aoffset1 = aoffset; + aoffset += 4 * lda; + + boffset1 = boffset; + boffset += 32; + + for(i = (n >> 3); i > 0; i--) { + size_t vl = 4; + + VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT), vl); + VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl); + + aoffset1 += 8; + boffset1 += m * 8; + } + + if (n & 4) { + size_t vl = 4; + + VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT), vl); + VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl); + + aoffset1 += 4; + boffset2 += 16; + } + + if (n & 2) { + size_t vl = 4; + + VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT), vl); + VSSEG2_FLOAT(boffset3, v0, v1, vl); + + aoffset1 += 2; + boffset3 += 8; + } + + if (n & 1) { + size_t vl = 4; + + v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSEV_FLOAT(boffset4, v0, vl); + + aoffset1 += 1; + boffset4 += 4; + } + } + + if (m & 2) { + aoffset1 = aoffset; + aoffset += 2 * lda; + + boffset1 = boffset; + boffset += 16; + + for(i = (n >> 3); i > 0; i--) { + size_t vl = 2; + + VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT), vl); + VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl); + + aoffset1 
+= 8; + boffset1 += m * 8; + } + + if (n & 4) { + size_t vl = 2; + + VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT), vl); + VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl); + + aoffset1 += 4; + boffset2 += 8; + } + + if (n & 2) { + size_t vl = 2; + + VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT), vl); + VSSEG2_FLOAT(boffset3, v0, v1, vl); + + aoffset1 += 2; + boffset3 += 4; + } + + if (n & 1) { + size_t vl = 2; + + v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSEV_FLOAT(boffset4, v0, vl); + + aoffset1 += 1; + boffset4 += 2; + } + } + + if (m & 1) { + aoffset1 = aoffset; + boffset1 = boffset; + + for(i = (n >> 3); i > 0; i--) { + size_t vl = 8; + + v0 = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset1, v0, vl); + + aoffset1 += 8; + boffset1 += 8 * m; + } + + if (n & 4) { + size_t vl = 4; + + v0 = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset2, v0, vl); + + aoffset1 += 4; + //boffset2 += 4; + } + + if (n & 2) { + size_t vl = 2; + + v0 = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset3, v0, vl); + + aoffset1 += 2; + // boffset3 += 2; + } + + if (n & 1) { + *(boffset4) = *(aoffset1); + // aoffset1 ++; + // boffset4 ++; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemm_tcopy_rvv_v1.c b/kernel/riscv64/gemm_tcopy_rvv_v1.c new file mode 100644 index 000000000..a291b70b8 --- /dev/null +++ b/kernel/riscv64/gemm_tcopy_rvv_v1.c @@ -0,0 +1,74 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) +{ + BLASLONG i, j; + + IFLOAT *aoffset; + IFLOAT *aoffset1; + IFLOAT *boffset; + + FLOAT_V_T v0; + size_t vl; + + //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); + + aoffset = a; + boffset = b; + + for(j = n; j > 0; j -= vl) { + vl = VSETVL(j); + + aoffset1 = aoffset; + aoffset += vl; + + for(i = m; i > 0; i--) { + v0 = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset, v0, vl); + + aoffset1 += lda; + boffset += vl; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemmkernel_2x2_rvv.c b/kernel/riscv64/gemmkernel_2x2_rvv.c new file mode 100644 index 000000000..ec8961ced --- /dev/null +++ b/kernel/riscv64/gemmkernel_2x2_rvv.c @@ -0,0 +1,214 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m4() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEG2_FLOAT vlseg2e32_v_f32m4 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMACCVF_FLOAT vfmacc_vf_f32m4 +#define VFMACCVV_FLOAT vfmacc_vv_f32m4 +#define VFREDSUMVS_FLOAT vfredusum_vs_f32m4_f32m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m4() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEG2_FLOAT vlseg2e64_v_f64m4 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMACCVF_FLOAT vfmacc_vf_f64m4 +#define VFMACCVV_FLOAT vfmacc_vv_f64m4 +#define VFREDSUMVS_FLOAT vfredusum_vs_f64m4_f64m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +// Optimizes the implementation in ../generic/gemm_kernel_2x2.c + +int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, IFLOAT* ba, IFLOAT* bb, FLOAT* C, BLASLONG ldc +#ifdef TRMMKERNEL + ,BLASLONG offset +#endif + ) +{ + BLASLONG i,j,k; + FLOAT *C0,*C1; + IFLOAT *ptrba,*ptrbb; + + //fprintf(stderr, "gemm_kernel_2x2 bm=%ld bn=%ld bk=%ld alpha=%f ldc=%ld\n", bm, bn, bk, alpha, ldc); + + FLOAT_V_T va0, va1, vb0, vb1; + FLOAT_V_T vres0, vres1, vres2, vres3; + FLOAT_V_T_M1 vsum0, vsum1, vsum2, vsum3; + FLOAT_V_T_M1 v_z0; + + v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + size_t vl; + + for (j = bn/2; j > 0; j--) { + C0 = C; + C1 = C0 + ldc; + ptrba = ba; + + for (i = bm/2; i > 0; i--) { + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); + + ptrba += vl*2; + ptrbb += vl*2; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + C1[0] += alpha * VFMVFS_FLOAT_M1(vsum2); + C1[1] += alpha * VFMVFS_FLOAT_M1(vsum3); + + C0 += 2; + C1 += 2; + } + + if(bm & 1) { + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + va0 = VLEV_FLOAT(ptrba, vl); + VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + ptrba += vl; + ptrbb += vl*2; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C1[0] += alpha * VFMVFS_FLOAT_M1(vsum1); + + C0 += 1; + C1 += 1; + } + + bb += (bk<<1); + C += (ldc<<1); + } + + if(bn & 1) { + C0 = C; + ptrba = ba; + for (i = bm/2; i > 0; i--) { + ptrbb = bb; + + 
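            /* Same scheme as the 2x2 blocks above, but against a single B
             * column: vres0/vres1 accumulate element-wise products over K and
             * are reduced to scalars afterwards.  As a rough scalar sketch of
             * what this tail computes (packed layouts as produced by the copy
             * kernels in this series):
             *
             *     // res0 = sum_k ptrba[2*k + 0] * ptrbb[k];
             *     // res1 = sum_k ptrba[2*k + 1] * ptrbb[k];
             *     // C0[0] += alpha * res0;  C0[1] += alpha * res1;
             */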
vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vb0 = VLEV_FLOAT(ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + + ptrba += vl*2; + ptrbb += vl; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + + C0 += 2; + } + + if(bm & 1) { + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + va0 = VLEV_FLOAT(ptrba, vl); + vb0 = VLEV_FLOAT(ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + + ptrba += vl; + ptrbb += vl; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + + C0 += 1; + } + + bb += (bk<<0); + C += ldc; + } + + return 0; +} diff --git a/kernel/riscv64/gemmkernel_4x4_rvv.c b/kernel/riscv64/gemmkernel_4x4_rvv.c new file mode 100644 index 000000000..aa58bcc76 --- /dev/null +++ b/kernel/riscv64/gemmkernel_4x4_rvv.c @@ -0,0 +1,508 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m1(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m1_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m1 +#define VLSEG2_FLOAT vlseg2e32_v_f32m1 +#define VLSEG4_FLOAT vlseg4e32_v_f32m1 +#define VFMVVF_FLOAT vfmv_v_f_f32m1 +#define VFMACCVF_FLOAT vfmacc_vf_f32m1 +#define VFMACCVV_FLOAT vfmacc_vv_f32m1 +#define VFREDSUMVS_FLOAT vfredusum_vs_f32m1_f32m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m1(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m1_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m1 +#define VLSEG2_FLOAT vlseg2e64_v_f64m1 +#define VLSEG4_FLOAT vlseg4e64_v_f64m1 +#define VFMVVF_FLOAT vfmv_v_f_f64m1 +#define VFMACCVF_FLOAT vfmacc_vf_f64m1 +#define VFMACCVV_FLOAT vfmacc_vv_f64m1 +#define VFREDSUMVS_FLOAT vfredusum_vs_f64m1_f64m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +// Optimizes the implementation in ../generic/gemm_kernel_2x2.c + +int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, IFLOAT* ba, IFLOAT* bb, FLOAT* C, BLASLONG ldc +#ifdef TRMMKERNEL + ,BLASLONG offset +#endif + ) +{ + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3; + IFLOAT *ptrba,*ptrbb; + + //fprintf(stderr, "gemm_kernel_4x4 bm=%ld bn=%ld bk=%ld alpha=%f ldc=%ld\n", bm, bn, bk, alpha, ldc); // KU + + FLOAT_V_T va0, va1, va2, va3; + FLOAT_V_T vb0, vb1, vb2, vb3; + FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; + FLOAT_V_T vres8, vres9, vres10, vres11, vres12, vres13, vres14, vres15; + FLOAT_V_T_M1 vsum0, vsum1, vsum2, vsum3; + FLOAT_V_T_M1 v_z0; + + v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + size_t vl; + + for (j = bn/4; j > 0; j--) { + C0 = C; + C1 = C0 + ldc; + C2 = C1 + ldc; + C3 = C2 + ldc; + ptrba = ba; + + for (i = bm/4; i > 0; i--) { + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); + vres4 = VFMVVF_FLOAT(0.0, vlmax); + vres5 = VFMVVF_FLOAT(0.0, vlmax); + vres6 = VFMVVF_FLOAT(0.0, vlmax); + vres7 = VFMVVF_FLOAT(0.0, vlmax); + vres8 = VFMVVF_FLOAT(0.0, vlmax); + vres9 = VFMVVF_FLOAT(0.0, vlmax); + vres10 = VFMVVF_FLOAT(0.0, vlmax); + vres11 = VFMVVF_FLOAT(0.0, vlmax); + vres12 = VFMVVF_FLOAT(0.0, vlmax); + vres13 = VFMVVF_FLOAT(0.0, vlmax); + vres14 = VFMVVF_FLOAT(0.0, vlmax); + vres15 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); + VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); + + vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); + vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl); + vres6 = VFMACCVV_FLOAT(vres6, va0, vb3, vl); + vres7 = VFMACCVV_FLOAT(vres7, va1, vb3, vl); + + vres8 = VFMACCVV_FLOAT(vres8, va2, vb0, vl); + vres9 = VFMACCVV_FLOAT(vres9, va3, vb0, vl); + vres10 = VFMACCVV_FLOAT(vres10, va2, vb1, vl); + vres11 = VFMACCVV_FLOAT(vres11, va3, vb1, vl); + + vres12 = VFMACCVV_FLOAT(vres12, va2, vb2, vl); + vres13 = 
VFMACCVV_FLOAT(vres13, va3, vb2, vl); + vres14 = VFMACCVV_FLOAT(vres14, va2, vb3, vl); + vres15 = VFMACCVV_FLOAT(vres15, va3, vb3, vl); + + ptrba += vl*4; + ptrbb += vl*4; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres8, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres9, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + C0[2] += alpha * VFMVFS_FLOAT_M1(vsum2); + C0[3] += alpha * VFMVFS_FLOAT_M1(vsum3); + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres3, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres10, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres11, v_z0, vlmax); + C1[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C1[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + C1[2] += alpha * VFMVFS_FLOAT_M1(vsum2); + C1[3] += alpha * VFMVFS_FLOAT_M1(vsum3); + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres4, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres5, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres12, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres13, v_z0, vlmax); + C2[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C2[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + C2[2] += alpha * VFMVFS_FLOAT_M1(vsum2); + C2[3] += alpha * VFMVFS_FLOAT_M1(vsum3); + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres6, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres7, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres14, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres15, v_z0, vlmax); + C3[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C3[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + C3[2] += alpha * VFMVFS_FLOAT_M1(vsum2); + C3[3] += alpha * VFMVFS_FLOAT_M1(vsum3); + + C0 += 4; + C1 += 4; + C2 += 4; + C3 += 4; + } + + if(bm & 2) { + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); + vres4 = VFMVVF_FLOAT(0.0, vlmax); + vres5 = VFMVVF_FLOAT(0.0, vlmax); + vres6 = VFMVVF_FLOAT(0.0, vlmax); + vres7 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); + + vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); + vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl); + vres6 = VFMACCVV_FLOAT(vres6, va0, vb3, vl); + vres7 = VFMACCVV_FLOAT(vres7, va1, vb3, vl); + + ptrba += vl*2; + ptrbb += vl*4; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres3, v_z0, vlmax); + C1[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C1[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres4, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres5, v_z0, vlmax); + C2[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C2[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres6, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres7, v_z0, vlmax); + C3[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C3[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + + C0 += 2; + C1 += 2; + C2 += 2; + C3 += 2; + } + + if(bm & 1) { + 
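            /* Tail for a single remaining row of the A panel against the
             * current four columns of B: va0 holds a chunk of that row,
             * vb0..vb3 the matching de-interleaved chunks of the four B
             * columns, and vres0..vres3 the four running dot products.
             * Roughly, assuming the packed layouts used throughout this file:
             *
             *     for (k = 0; k < bk; k++)
             *         for (int col = 0; col < 4; col++)
             *             res[col] += ptrba[k] * ptrbb[4*k + col];
             *     // then C0[0] += alpha*res[0], ..., C3[0] += alpha*res[3]
             */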
ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + va0 = VLEV_FLOAT(ptrba, vl); + VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl); + vres3 = VFMACCVV_FLOAT(vres3, va0, vb3, vl); + + ptrba += vl; + ptrbb += vl*4; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C1[0] += alpha * VFMVFS_FLOAT_M1(vsum1); + C2[0] += alpha * VFMVFS_FLOAT_M1(vsum2); + C3[0] += alpha * VFMVFS_FLOAT_M1(vsum3); + + C0 += 1; + C1 += 1; + C2 += 1; + C3 += 1; + } + + bb += (bk<<2); + C += (ldc<<2); + } + + if(bn & 2) { + + C0 = C; + C1 = C0 + ldc; + ptrba = ba; + + for (i = bm/4; i > 0; i--) { + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); + + vres4 = VFMVVF_FLOAT(0.0, vlmax); + vres5 = VFMVVF_FLOAT(0.0, vlmax); + vres6 = VFMVVF_FLOAT(0.0, vlmax); + vres7 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); + VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); + + vres4 = VFMACCVV_FLOAT(vres4, va0, vb1, vl); + vres5 = VFMACCVV_FLOAT(vres5, va1, vb1, vl); + vres6 = VFMACCVV_FLOAT(vres6, va2, vb1, vl); + vres7 = VFMACCVV_FLOAT(vres7, va3, vb1, vl); + + ptrba += vl*4; + ptrbb += vl*2; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + C0[2] += alpha * VFMVFS_FLOAT_M1(vsum2); + C0[3] += alpha * VFMVFS_FLOAT_M1(vsum3); + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres4, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres5, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres6, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres7, v_z0, vlmax); + C1[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C1[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + C1[2] += alpha * VFMVFS_FLOAT_M1(vsum2); + C1[3] += alpha * VFMVFS_FLOAT_M1(vsum3); + + C0 += 4; + C1 += 4; + } + + if(bm & 2) { + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); + + ptrba += vl*2; + ptrbb += vl*2; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); + 
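            /* Each vsumX now holds a finished dot product in element 0;
             * VFMVFS_FLOAT_M1 below extracts it so this 2x2 tile of the
             * 2-column panel is updated as C := alpha*A*B + C, roughly:
             *
             *     // C0[0] += alpha * (rowA0 . colB0);  C0[1] += alpha * (rowA1 . colB0);
             *     // C1[0] += alpha * (rowA0 . colB1);  C1[1] += alpha * (rowA1 . colB1);
             */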
C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + C1[0] += alpha * VFMVFS_FLOAT_M1(vsum2); + C1[1] += alpha * VFMVFS_FLOAT_M1(vsum3); + + C0 += 2; + C1 += 2; + } + + if(bm & 1) { + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + va0 = VLEV_FLOAT(ptrba, vl); + VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + ptrba += vl; + ptrbb += vl*2; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C1[0] += alpha * VFMVFS_FLOAT_M1(vsum1); + + C0 += 1; + C1 += 1; + } + + bb += (bk<<1); + C += (ldc<<1); + } + + if(bn & 1) { + C0 = C; + ptrba = ba; + for (i = bm/4; i > 0; i--) { + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); + vb0 = VLEV_FLOAT(ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); + + ptrba += vl*4; + ptrbb += vl; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + C0[2] += alpha * VFMVFS_FLOAT_M1(vsum2); + C0[3] += alpha * VFMVFS_FLOAT_M1(vsum3); + + C0 += 4; + } + + if(bm & 2) { + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vb0 = VLEV_FLOAT(ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + + ptrba += vl*2; + ptrbb += vl; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + + C0 += 2; + } + + if(bm & 1) { + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + va0 = VLEV_FLOAT(ptrba, vl); + vb0 = VLEV_FLOAT(ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + + ptrba += vl; + ptrbb += vl; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + + C0 += 1; + } + + bb += (bk<<0); + C += ldc; + } + + return 0; +} diff --git a/kernel/riscv64/gemmkernel_rvv_v1x8.c b/kernel/riscv64/gemmkernel_rvv_v1x8.c new file mode 100644 index 000000000..5cd509f93 --- /dev/null +++ b/kernel/riscv64/gemmkernel_rvv_v1x8.c @@ -0,0 +1,601 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VFMVVF_FLOAT vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT vfmacc_vf_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VFMVVF_FLOAT vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT vfmacc_vf_f64m2 +#endif + +int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, IFLOAT* ba, IFLOAT* bb, FLOAT* C, BLASLONG ldc +#ifdef TRMMKERNEL + ,BLASLONG offset +#endif + ) +{ + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7; + IFLOAT *ptrba,*ptrbb; + + //fprintf(stderr, "%s, bm=%ld bn=%ld bk=%ld alpha=%f ldc=%ld\n", __FUNCTION__, bm, bn, bk, alpha, ldc); // Debug + + FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7; + FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; + size_t vl; + + // N:8 + for (j = bn/8; j > 0; j--) { + C0 = C; + C1 = C0 + ldc; + C2 = C1 + ldc; + C3 = C2 + ldc; + C4 = C3 + ldc; + C5 = C4 + ldc; + C6 = C5 + ldc; + C7 = C6 + ldc; + ptrba = ba; + + for (i = bm; i > 0; i -= vl) { + vl = VSETVL(i); + + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + vres4 = VFMVVF_FLOAT(0.0, vl); + vres5 = VFMVVF_FLOAT(0.0, vl); + vres6 = VFMVVF_FLOAT(0.0, vl); + vres7 = VFMVVF_FLOAT(0.0, vl); +#if 0 + for (k = bk; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); + + ptrba += vl; + ptrbb += 8; + } +#else + // Unroll K + for (k = bk/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), 
va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); + ptrbb += 8; + + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va1, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va1, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va1, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va1, vl); + ptrbb += 8; + + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va2, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va2, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va2, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va2, vl); + ptrbb += 8; + + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va3, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va3, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va3, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va3, vl); + ptrbb += 8; + + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va4, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va4, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va4, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va4, vl); + ptrbb += 8; + + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va5, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va5, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va5, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va5, vl); + ptrbb += 8; + + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va6, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va6, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va6, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va6, vl); + ptrbb += 8; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); + vres2 
= VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va7, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va7, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va7, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va7, vl); + ptrbb += 8; + } + + // K remainder + for (k = bk&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); + + ptrbb += 8; + ptrba += vl; + } +#endif + va0 = VLEV_FLOAT(C0, vl); + va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl); + VSEV_FLOAT(C0, va0, vl); + + va1 = VLEV_FLOAT(C1, vl); + va1 = VFMACCVF_FLOAT(va1, alpha, vres1, vl); + VSEV_FLOAT(C1, va1, vl); + + va2 = VLEV_FLOAT(C2, vl); + va2 = VFMACCVF_FLOAT(va2, alpha, vres2, vl); + VSEV_FLOAT(C2, va2, vl); + + va3 = VLEV_FLOAT(C3, vl); + va3 = VFMACCVF_FLOAT(va3, alpha, vres3, vl); + VSEV_FLOAT(C3, va3, vl); + + va4 = VLEV_FLOAT(C4, vl); + va4 = VFMACCVF_FLOAT(va4, alpha, vres4, vl); + VSEV_FLOAT(C4, va4, vl); + + va5 = VLEV_FLOAT(C5, vl); + va5 = VFMACCVF_FLOAT(va5, alpha, vres5, vl); + VSEV_FLOAT(C5, va5, vl); + + va6 = VLEV_FLOAT(C6, vl); + va6 = VFMACCVF_FLOAT(va6, alpha, vres6, vl); + VSEV_FLOAT(C6, va6, vl); + + va7 = VLEV_FLOAT(C7, vl); + va7 = VFMACCVF_FLOAT(va7, alpha, vres7, vl); + VSEV_FLOAT(C7, va7, vl); + + C0 += vl; + C1 += vl; + C2 += vl; + C3 += vl; + C4 += vl; + C5 += vl; + C6 += vl; + C7 += vl; + } + + bb += (bk<<3); + C += (ldc<<3); + } + + // N:4 + if (bn & 4) { + C0 = C; + C1 = C0 + ldc; + C2 = C1 + ldc; + C3 = C2 + ldc; + ptrba = ba; + + for (i = bm; i > 0; i -= vl) { + vl = VSETVL(i); + + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + +#if 0 + for (k = bk; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + + ptrba += vl; + ptrbb += 4; + } +#else + // Unroll K + for (k = bk/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + ptrbb += 4; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl); + ptrbb += 4; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl); + ptrbb += 4; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = 
VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl); + ptrbb += 4; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl); + ptrbb += 4; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl); + ptrbb += 4; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl); + ptrbb += 4; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl); + ptrbb += 4; + } + + // K remainder + for (k = bk&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + + ptrbb += 4; + ptrba += vl; + } +#endif + va0 = VLEV_FLOAT(C0, vl); + va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl); + VSEV_FLOAT(C0, va0, vl); + + va1 = VLEV_FLOAT(C1, vl); + va1 = VFMACCVF_FLOAT(va1, alpha, vres1, vl); + VSEV_FLOAT(C1, va1, vl); + + va2 = VLEV_FLOAT(C2, vl); + va2 = VFMACCVF_FLOAT(va2, alpha, vres2, vl); + VSEV_FLOAT(C2, va2, vl); + + va3 = VLEV_FLOAT(C3, vl); + va3 = VFMACCVF_FLOAT(va3, alpha, vres3, vl); + VSEV_FLOAT(C3, va3, vl); + + C0 += vl; + C1 += vl; + C2 += vl; + C3 += vl; + } + + bb += (bk<<2); + C += (ldc<<2); + } + + // N:2 + if (bn & 2) { + C0 = C; + C1 = C0 + ldc; + ptrba = ba; + + for (i = bm; i > 0; i -= vl) { + vl = VSETVL(i); + + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); +#if 0 + for (k = bk; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + + ptrba += vl; + ptrbb += 2; + } +#else + // Unroll K + for (k = bk/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + ptrbb += 2; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); + ptrbb += 2; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); + ptrbb += 2; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); + ptrbb += 2; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), 
va4, vl); + ptrbb += 2; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); + ptrbb += 2; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); + ptrbb += 2; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); + ptrbb += 2; + } + + // K remainder + for (k = bk&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + + ptrbb += 2; + ptrba += vl; + } +#endif + va0 = VLEV_FLOAT(C0, vl); + va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl); + VSEV_FLOAT(C0, va0, vl); + + va1 = VLEV_FLOAT(C1, vl); + va1 = VFMACCVF_FLOAT(va1, alpha, vres1, vl); + VSEV_FLOAT(C1, va1, vl); + + C0 += vl; + C1 += vl; + } + + bb += (bk<<1); + C += (ldc<<1); + } + + // N:1 + if (bn & 1) { + C0 = C; + ptrba = ba; + + for (i = bm; i > 0; i -= vl) { + vl = VSETVL(i); + + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vl); +#if 0 + for (k = bk; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + + ptrba += vl; + ptrbb += 1; + } +#else + // Unroll K + for (k = bk/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + ptrbb += 1; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + ptrbb += 1; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + ptrbb += 1; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + ptrbb += 1; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + ptrbb += 1; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + ptrbb += 1; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + ptrbb += 1; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + ptrbb += 1; + } + + // K remainder + for (k = bk&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + + ptrbb += 1; + ptrba += vl; + } +#endif + va0 = VLEV_FLOAT(C0, vl); + va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl); + VSEV_FLOAT(C0, va0, vl); + + C0 += vl; + } + + bb += (bk); + C += (ldc); + } + + return 0; +} diff --git a/kernel/riscv64/gemv_n_rvv.c b/kernel/riscv64/gemv_n_rvv.c new file mode 100644 index 000000000..9d2dee615 --- /dev/null +++ b/kernel/riscv64/gemv_n_rvv.c @@ -0,0 +1,94 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 +#define VFMACCVF_FLOAT vfmacc_vf_f32m8 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 +#define VFMACCVF_FLOAT vfmacc_vf_f64m8 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + if(n < 0) return(0); + + FLOAT *a_ptr, *x_ptr; + BLASLONG i; + FLOAT_V_T va, vy; + + if(inc_y == 1) { + + for (size_t vl; m > 0; m -= vl, y += vl, a += vl) { + vl = VSETVL(m); + a_ptr = a; + x_ptr = x; + vy = VLEV_FLOAT(y, vl); + for(i = 0; i < n; i++) { + va = VLEV_FLOAT(a_ptr, vl); + vy = VFMACCVF_FLOAT(vy, (alpha * (*x_ptr)), va, vl); + + a_ptr += lda; + x_ptr += inc_x; + } + VSEV_FLOAT(y, vy, vl); + } + + } else { + + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; m > 0; m -= vl, y += vl*inc_y, a += vl) { + vl = VSETVL(m); + a_ptr = a; + x_ptr = x; + vy = VLSEV_FLOAT(y, stride_y, vl); + for(i = 0; i < n; i++) { + va = VLEV_FLOAT(a_ptr, vl); + vy = VFMACCVF_FLOAT(vy, (alpha * (*x_ptr)), va, vl); + + a_ptr += lda; + x_ptr += inc_x; + } + VSSEV_FLOAT(y, stride_y, vy, vl); + } + + } + return(0); +} diff --git a/kernel/riscv64/gemv_t_rvv.c b/kernel/riscv64/gemv_t_rvv.c new file mode 100644 index 000000000..a80af81b6 --- /dev/null +++ b/kernel/riscv64/gemv_t_rvv.c @@ -0,0 +1,119 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1 +#define VFMACCVV_FLOAT vfmacc_vv_f32m8 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1 +#define VFMACCVV_FLOAT vfmacc_vv_f64m8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i, j; + FLOAT *a_ptr, *x_ptr; + + FLOAT_V_T va, vx, vr; + FLOAT_V_T_M1 v_res, v_z0; + size_t vlmax = VSETVL_MAX_M1; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_z0 = VFMVVF_FLOAT_M1(0, vlmax); + vlmax = VSETVL_MAX; + + if(inc_x == 1) { + + for(i = 0; i < n; i++) { + j = m; + a_ptr = a; + x_ptr = x; + vr = VFMVVF_FLOAT(0, vlmax); + + for (size_t vl; j > 0; j -= vl, a_ptr += vl, x_ptr += vl) { + vl = VSETVL(j); + + va = VLEV_FLOAT(a_ptr, vl); + vx = VLEV_FLOAT(x_ptr, vl); + vr = VFMACCVV_FLOAT(vr, va, vx, vl); + } + + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + *y += alpha * VFMVFS_FLOAT_M1(v_res); + y += inc_y; + a += lda; + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for(i = 0; i < n; i++) { + j = m; + a_ptr = a; + x_ptr = x; + vr = VFMVVF_FLOAT(0, vlmax); + + for (size_t vl; j > 0; j -= vl, a_ptr += vl, x_ptr += vl*inc_x) { + vl = VSETVL(j); + + va = VLEV_FLOAT(a_ptr, vl); + vx = VLSEV_FLOAT(x_ptr, stride_x, vl); + vr = VFMACCVV_FLOAT(vr, va, vx, vl); + } + + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + *y += alpha * VFMVFS_FLOAT_M1(v_res); + y += inc_y; + a += lda; + } + + } + + return(0); +} diff --git a/kernel/riscv64/iamax_rvv.c b/kernel/riscv64/iamax_rvv.c new file mode 100644 index 000000000..8b33b3bcb --- /dev/null +++ b/kernel/riscv64/iamax_rvv.c @@ -0,0 +1,150 @@ 
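For reference, the two GEMV kernels above pair naturally: gemv_n_rvv.c keeps a vector strip of y resident and sweeps the columns of A with scalar-times-vector FMAs, while gemv_t_rvv.c forms one full-length dot product per column and folds it with a single vfredusum. A minimal scalar model of the transposed case, assuming unit strides, column-major A, and the FLOAT/BLASLONG typedefs from common.h (the helper name is illustrative, not patch code):

static void gemv_t_ref(BLASLONG m, BLASLONG n, FLOAT alpha,
                       FLOAT *a, BLASLONG lda, FLOAT *x, FLOAT *y)
{
    for (BLASLONG i = 0; i < n; i++) {       /* one column of A per output */
        FLOAT sum = 0.0;
        for (BLASLONG j = 0; j < m; j++)     /* strip-mined by VSETVL above */
            sum += a[i * lda + j] * x[j];
        y[i] += alpha * sum;                 /* vfredusum + scalar update */
    }
}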
+/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if defined(DOUBLE) +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 +#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFABSV_FLOAT vfabs_v_f64m8 +#define VFMAXVV_FLOAT vfmax_vv_f64m8 +#define VFIRSTM vfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT vid_v_u64m8_m +#define VIDV_UINT vid_v_u64m8 +#define VADDVX_MASK_UINT vadd_vx_u64m8_m +#define VADDVX_UINT vadd_vx_u64m8 +#define VMVVX_UINT vmv_v_x_u64m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT vslidedown_vx_u64m8 +#define VMVVXS_UINT vmv_x_s_u64m8_u64 +#else +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 +#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFABSV_FLOAT vfabs_v_f32m8 +#define VFMAXVV_FLOAT vfmax_vv_f32m8 +#define VFIRSTM vfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT vid_v_u32m8_m +#define VIDV_UINT vid_v_u32m8 +#define VADDVX_MASK_UINT vadd_vx_u32m8_m +#define VADDVX_UINT vadd_vx_u32m8 +#define VMVVX_UINT vmv_v_x_u32m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT vslidedown_vx_u32m8 +#define VMVVXS_UINT 
vmv_x_s_u32m8_u32 +#endif + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + unsigned int max_index = 0; + if (n <= 0 || inc_x <= 0) return(max_index); + + FLOAT_V_T vx, v_max; + UINT_V_T v_max_index; + MASK_T mask; + + size_t vlmax = VSETVL_MAX; + v_max_index = VMVVX_UINT(0, vlmax); + v_max = VFMVVF_FLOAT(-1, vlmax); + BLASLONG j=0; + FLOAT maxf=0.0; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl, j += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vx = VFABSV_FLOAT(vx, vl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx, vl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + + //update v_max + v_max = VFMAXVV_FLOAT(v_max, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vx = VFABSV_FLOAT(vx, vl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx, vl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + + //update v_max + v_max = VFMAXVV_FLOAT(v_max, vx, vl); + } + + } + + FLOAT_V_T_M1 v_res, v_z0; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_z0 = VFMVVF_FLOAT_M1(0, vlmax); + + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, vlmax); + maxf = VFMVFS_FLOAT_M1(v_res); + mask = VMFGEVF_FLOAT(v_max, maxf, vlmax); + max_index = VFIRSTM(mask, vlmax); + + v_max_index = VSLIDEDOWN_UINT(v_max_index, v_max_index, max_index, vlmax); + max_index = VMVVXS_UINT(v_max_index); + + return(max_index+1); +} diff --git a/kernel/riscv64/iamin_rvv.c b/kernel/riscv64/iamin_rvv.c new file mode 100644 index 000000000..585b37186 --- /dev/null +++ b/kernel/riscv64/iamin_rvv.c @@ -0,0 +1,151 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
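The kernel above must implement the BLAS i*amax contract: return the 1-based position of the first element with the largest absolute value, or 0 for empty input. A scalar sketch of that contract, using the FLOAT/BLASLONG typedefs from common.h (illustrative only):

static BLASLONG iamax_ref(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    if (n <= 0 || inc_x <= 0) return 0;
    BLASLONG best = 0;
    FLOAT maxf = -1.0;                        /* any |x[i]| >= 0 beats this */
    for (BLASLONG i = 0; i < n; i++) {
        FLOAT v = x[i * inc_x] < 0 ? -x[i * inc_x] : x[i * inc_x];
        if (v > maxf) { maxf = v; best = i; } /* strict >: first hit wins */
    }
    return best + 1;
}

The vector version reaches the same result in three stages: a per-lane running maximum and saved index during the strip-mined loop, a vfredmax to find the global maximum, then vfirst plus vslidedown plus vmv_x_s to pull the saved index out of the first lane holding that maximum.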
+*****************************************************************************/ + +#include "common.h" +#include <float.h> + +#if defined(DOUBLE) +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 +#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFABSV_FLOAT vfabs_v_f64m8 +#define VFMINVV_FLOAT vfmin_vv_f64m8 +#define VFIRSTM vfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT vid_v_u64m8_m +#define VIDV_UINT vid_v_u64m8 +#define VADDVX_MASK_UINT vadd_vx_u64m8_m +#define VADDVX_UINT vadd_vx_u64m8 +#define VMVVX_UINT vmv_v_x_u64m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT vslidedown_vx_u64m8 +#define VMVVXS_UINT vmv_x_s_u64m8_u64 +#else +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 +#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFABSV_FLOAT vfabs_v_f32m8 +#define VFMINVV_FLOAT vfmin_vv_f32m8 +#define VFIRSTM vfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT vid_v_u32m8_m +#define VIDV_UINT vid_v_u32m8 +#define VADDVX_MASK_UINT vadd_vx_u32m8_m +#define VADDVX_UINT vadd_vx_u32m8 +#define VMVVX_UINT vmv_v_x_u32m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT vslidedown_vx_u32m8 +#define VMVVXS_UINT vmv_x_s_u32m8_u32 +#endif + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + unsigned int min_index = 0; + if (n <= 0 || inc_x <= 0) return(min_index); + + FLOAT_V_T vx, v_min; + UINT_V_T v_min_index; + MASK_T mask; + + size_t vlmax = VSETVL_MAX; + v_min_index = VMVVX_UINT(0, vlmax); + v_min = VFMVVF_FLOAT(FLT_MAX, vlmax); + BLASLONG j=0; + FLOAT minf=0.0; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl, j += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vx = VFABSV_FLOAT(vx, vl); + + // index where element less than v_min + mask = VMFLTVV_FLOAT(vx, v_min, vl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT(v_min, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vx = VFABSV_FLOAT(vx, vl); + + // index where element less than v_min + mask = VMFLTVV_FLOAT(vx, v_min, vl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT(v_min, vx, vl); + } + + } + + FLOAT_V_T_M1 v_res, v_max; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_max = VFMVVF_FLOAT_M1(FLT_MAX, vlmax); + + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, vlmax); + minf = VFMVFS_FLOAT_M1(v_res); + mask = VMFLEVF_FLOAT(v_min, minf, vlmax); + min_index =
VFIRSTM(mask, vlmax); + + v_min_index = VSLIDEDOWN_UINT(v_min_index, v_min_index, min_index, vlmax); + min_index = VMVVXS_UINT(v_min_index); + + return(min_index+1); +} diff --git a/kernel/riscv64/imax_rvv.c b/kernel/riscv64/imax_rvv.c new file mode 100644 index 000000000..d84ad968e --- /dev/null +++ b/kernel/riscv64/imax_rvv.c @@ -0,0 +1,147 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
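The index bookkeeping shared by iamax_rvv.c and iamin_rvv.c is the same masked three-instruction pattern each iteration. What one lane l does, written out in scalar form with j the count of elements already consumed (a sketch, not patch code):

static void imin_lane_step(BLASLONG l, BLASLONG j, FLOAT v,
                           FLOAT *lane_best, BLASLONG *lane_index)
{
    if (v < lane_best[l]) {          /* mask = vmflt_vv(vx, v_min)        */
        lane_index[l] = j + l;       /* vid_v + vadd_vx under the mask:
                                        j + l is the global position      */
        lane_best[l]  = v;           /* vfmin_vv keeps the smaller value  */
    }
}

So after the loop every lane holds the position of the best element it has personally seen, and the final reduction only has to pick the winning lane.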
+*****************************************************************************/ + +#include "common.h" +#include <float.h> + +#if defined(DOUBLE) +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 +#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT vfmax_vv_f64m8 +#define VFIRSTM vfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT vid_v_u64m8_m +#define VIDV_UINT vid_v_u64m8 +#define VADDVX_MASK_UINT vadd_vx_u64m8_m +#define VADDVX_UINT vadd_vx_u64m8 +#define VMVVX_UINT vmv_v_x_u64m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT vslidedown_vx_u64m8 +#define VMVVXS_UINT vmv_x_s_u64m8_u64 +#else +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 +#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT vfmax_vv_f32m8 +#define VFIRSTM vfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT vid_v_u32m8_m +#define VIDV_UINT vid_v_u32m8 +#define VADDVX_MASK_UINT vadd_vx_u32m8_m +#define VADDVX_UINT vadd_vx_u32m8 +#define VMVVX_UINT vmv_v_x_u32m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT vslidedown_vx_u32m8 +#define VMVVXS_UINT vmv_x_s_u32m8_u32 +#endif + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + unsigned int max_index = 0; + if (n <= 0 || inc_x <= 0) return(max_index); + + FLOAT_V_T vx, v_max; + UINT_V_T v_max_index; + MASK_T mask; + + size_t vlmax = VSETVL_MAX; + v_max_index = VMVVX_UINT(0, vlmax); + v_max = VFMVVF_FLOAT(-FLT_MAX, vlmax); + BLASLONG j=0; + FLOAT maxf=0.0; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl, j += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx, vl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + + //update v_max and start_index j + v_max = VFMAXVV_FLOAT(v_max, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx, vl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + + //update v_max and start_index j + v_max = VFMAXVV_FLOAT(v_max, vx, vl); + } + + } + + FLOAT_V_T_M1 v_res, v_min; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_min = VFMVVF_FLOAT_M1(-FLT_MAX, vlmax); + + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, vlmax); + maxf = VFMVFS_FLOAT_M1(v_res); + mask = VMFGEVF_FLOAT(v_max, maxf, vlmax); + max_index = VFIRSTM(mask, vlmax); + + v_max_index = VSLIDEDOWN_UINT(v_max_index, v_max_index, max_index, vlmax); + max_index =
VMVVXS_UINT(v_max_index); + + return(max_index+1); +} diff --git a/kernel/riscv64/imin_rvv.c b/kernel/riscv64/imin_rvv.c new file mode 100644 index 000000000..fb734f6f8 --- /dev/null +++ b/kernel/riscv64/imin_rvv.c @@ -0,0 +1,147 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
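Note that imax_rvv.c above, unlike iamax, compares signed values (there is no vfabs), which is why its running maximum starts at -FLT_MAX rather than -1. The scalar contract it implements (illustrative sketch):

static BLASLONG imax_ref(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    if (n <= 0 || inc_x <= 0) return 0;
    BLASLONG best = 0;
    for (BLASLONG i = 1; i < n; i++)
        if (x[i * inc_x] > x[best * inc_x])   /* strict >: first hit wins */
            best = i;
    return best + 1;
}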
+*****************************************************************************/ + +#include "common.h" +#include <float.h> + +#if defined(DOUBLE) +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 +#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMINVV_FLOAT vfmin_vv_f64m8 +#define VFIRSTM vfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT vid_v_u64m8_m +#define VIDV_UINT vid_v_u64m8 +#define VADDVX_MASK_UINT vadd_vx_u64m8_m +#define VADDVX_UINT vadd_vx_u64m8 +#define VMVVX_UINT vmv_v_x_u64m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT vslidedown_vx_u64m8 +#define VMVVXS_UINT vmv_x_s_u64m8_u64 +#else +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 +#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMINVV_FLOAT vfmin_vv_f32m8 +#define VFIRSTM vfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT vid_v_u32m8_m +#define VIDV_UINT vid_v_u32m8 +#define VADDVX_MASK_UINT vadd_vx_u32m8_m +#define VADDVX_UINT vadd_vx_u32m8 +#define VMVVX_UINT vmv_v_x_u32m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT vslidedown_vx_u32m8 +#define VMVVXS_UINT vmv_x_s_u32m8_u32 +#endif + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + unsigned int min_index = 0; + if (n <= 0 || inc_x <= 0) return(min_index); + + FLOAT_V_T vx, v_min; + UINT_V_T v_min_index; + MASK_T mask; + + size_t vlmax = VSETVL_MAX; + v_min_index = VMVVX_UINT(0, vlmax); + v_min = VFMVVF_FLOAT(FLT_MAX, vlmax); + BLASLONG j=0; + FLOAT minf=0.0; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl, j += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + + // index where element less than v_min + mask = VMFLTVV_FLOAT(vx, v_min, vl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT(v_min, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + + // index where element less than v_min + mask = VMFLTVV_FLOAT(vx, v_min, vl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT(v_min, vx, vl); + } + + } + + FLOAT_V_T_M1 v_res, v_max; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_max = VFMVVF_FLOAT_M1(FLT_MAX, vlmax); + + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, vlmax); + minf = VFMVFS_FLOAT_M1(v_res); + mask = VMFLEVF_FLOAT(v_min, minf, vlmax); + min_index = VFIRSTM(mask, vlmax); + + v_min_index = VSLIDEDOWN_UINT(v_min_index, v_min_index, min_index, vlmax); + min_index =
VMVVXS_UINT(v_min_index); + + return(min_index+1); +} diff --git a/kernel/riscv64/izamax_rvv.c b/kernel/riscv64/izamax_rvv.c new file mode 100644 index 000000000..9cb332cbb --- /dev/null +++ b/kernel/riscv64/izamax_rvv.c @@ -0,0 +1,162 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if defined(DOUBLE) +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m4() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1 +#define MASK_T vbool16_t +#define VMFLTVF_FLOAT vmflt_vf_f64m4_b16 +#define VMFLTVV_FLOAT vmflt_vv_f64m4_b16 +#define VMFGEVF_FLOAT vmfge_vf_f64m4_b16 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFABSV_FLOAT vfabs_v_f64m4 +#define VFMAXVV_FLOAT vfmax_vv_f64m4 +#define VFADDVV_FLOAT vfadd_vv_f64m4 +#define VFIRSTM vfirst_m_b16 +#define UINT_V_T vuint64m4_t +#define VIDV_MASK_UINT vid_v_u64m4_m +#define VIDV_UINT vid_v_u64m4 +#define VADDVX_MASK_UINT vadd_vx_u64m4_m +#define VADDVX_UINT vadd_vx_u64m4 +#define VMVVX_UINT vmv_v_x_u64m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT vslidedown_vx_u64m4 +#define VMVVXS_UINT vmv_x_s_u64m4_u64 +#else +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m4() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f32m4_b8 +#define VMFLTVV_FLOAT vmflt_vv_f32m4_b8 +#define VMFGEVF_FLOAT vmfge_vf_f32m4_b8 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFABSV_FLOAT vfabs_v_f32m4 +#define VFMAXVV_FLOAT vfmax_vv_f32m4 +#define VFADDVV_FLOAT vfadd_vv_f32m4 +#define VFIRSTM vfirst_m_b8 +#define UINT_V_T vuint32m4_t +#define VIDV_MASK_UINT vid_v_u32m4_m +#define VIDV_UINT vid_v_u32m4 +#define VADDVX_MASK_UINT vadd_vx_u32m4_m +#define VADDVX_UINT vadd_vx_u32m4 +#define VMVVX_UINT vmv_v_x_u32m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT vslidedown_vx_u32m4 +#define VMVVXS_UINT vmv_x_s_u32m4_u32 +#endif + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + unsigned int max_index = 0; + if (n <= 0 || inc_x <= 0) return(max_index); + + FLOAT_V_T vx0, vx1, v_max; + UINT_V_T v_max_index; + MASK_T mask; + + size_t vlmax = VSETVL_MAX; + v_max_index = VMVVX_UINT(0, vlmax); + v_max = VFMVVF_FLOAT(-1, vlmax); + BLASLONG j=0; + FLOAT maxf=0.0; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2, j += vl) { + vl = VSETVL(n); + + VLSEG_FLOAT(&vx0, &vx1, x, vl); + + vx0 = VFABSV_FLOAT(vx0, vl); + vx1 = VFABSV_FLOAT(vx1, vl); + + vx0 = VFADDVV_FLOAT(vx0, vx1, vl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx0, vl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + + //update v_max and start_index j + v_max = VFMAXVV_FLOAT(v_max, vx0, vl); + } + } + else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, j += vl) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + + vx0 = VFABSV_FLOAT(vx0, vl); + vx1 = VFABSV_FLOAT(vx1, vl); + + vx0 = VFADDVV_FLOAT(vx0, vx1, vl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx0, vl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); + 
v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + + //update v_max and start_index j + v_max = VFMAXVV_FLOAT(v_max, vx0, vl); + } + + } + FLOAT_V_T_M1 v_res, v_z0; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_z0 = VFMVVF_FLOAT_M1(0, vlmax); + + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, vlmax); + maxf = VFMVFS_FLOAT_M1(v_res); + mask = VMFGEVF_FLOAT(v_max, maxf, vlmax); + max_index = VFIRSTM(mask, vlmax); + + v_max_index = VSLIDEDOWN_UINT(v_max_index, v_max_index, max_index, vlmax); + max_index = VMVVXS_UINT(v_max_index); + + return(max_index+1); +} diff --git a/kernel/riscv64/izamin_rvv.c b/kernel/riscv64/izamin_rvv.c new file mode 100644 index 000000000..69771e5aa --- /dev/null +++ b/kernel/riscv64/izamin_rvv.c @@ -0,0 +1,161 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
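As in the reference BLAS, izamax_rvv.c ranks complex elements by |Re| + |Im| rather than by the true modulus; the vlseg2/vlsseg2 loads deinterleave the real and imaginary parts into two registers so the two vfabs and the vfadd stay element-wise. A scalar model of that ranking (illustrative sketch):

static BLASLONG izamax_ref(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    if (n <= 0 || inc_x <= 0) return 0;
    BLASLONG best = 0;
    FLOAT maxf = -1.0;
    for (BLASLONG i = 0; i < n; i++) {
        FLOAT re = x[2 * i * inc_x];
        FLOAT im = x[2 * i * inc_x + 1];
        FLOAT v  = (re < 0 ? -re : re) + (im < 0 ? -im : im); /* |Re|+|Im| */
        if (v > maxf) { maxf = v; best = i; }
    }
    return best + 1;
}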
+*****************************************************************************/ + +#include "common.h" +#include <float.h> + +#if defined(DOUBLE) +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m4() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m4_f64m1 +#define MASK_T vbool16_t +#define VMFLTVF_FLOAT vmflt_vf_f64m4_b16 +#define VMFLTVV_FLOAT vmflt_vv_f64m4_b16 +#define VMFLEVF_FLOAT vmfle_vf_f64m4_b16 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFABSV_FLOAT vfabs_v_f64m4 +#define VFMINVV_FLOAT vfmin_vv_f64m4 +#define VFADDVV_FLOAT vfadd_vv_f64m4 +#define VFIRSTM vfirst_m_b16 +#define UINT_V_T vuint64m4_t +#define VIDV_MASK_UINT vid_v_u64m4_m +#define VIDV_UINT vid_v_u64m4 +#define VADDVX_MASK_UINT vadd_vx_u64m4_m +#define VADDVX_UINT vadd_vx_u64m4 +#define VMVVX_UINT vmv_v_x_u64m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT vslidedown_vx_u64m4 +#define VMVVXS_UINT vmv_x_s_u64m4_u64 +#else +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m4() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m4_f32m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f32m4_b8 +#define VMFLTVV_FLOAT vmflt_vv_f32m4_b8 +#define VMFLEVF_FLOAT vmfle_vf_f32m4_b8 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFABSV_FLOAT vfabs_v_f32m4 +#define VFMINVV_FLOAT vfmin_vv_f32m4 +#define VFADDVV_FLOAT vfadd_vv_f32m4 +#define VFIRSTM vfirst_m_b8 +#define UINT_V_T vuint32m4_t +#define VIDV_MASK_UINT vid_v_u32m4_m +#define VIDV_UINT vid_v_u32m4 +#define VADDVX_MASK_UINT vadd_vx_u32m4_m +#define VADDVX_UINT vadd_vx_u32m4 +#define VMVVX_UINT vmv_v_x_u32m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT vslidedown_vx_u32m4 +#define VMVVXS_UINT vmv_x_s_u32m4_u32 +#endif + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + unsigned int min_index = 0; + if (n <= 0 || inc_x <= 0) return(min_index); + + FLOAT_V_T vx0, vx1, v_min; + UINT_V_T v_min_index; + MASK_T mask; + + size_t vlmax = VSETVL_MAX; + v_min_index = VMVVX_UINT(0, vlmax); + v_min = VFMVVF_FLOAT(FLT_MAX, vlmax); + BLASLONG j=0; + FLOAT minf=0.0; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2, j += vl) { + vl = VSETVL(n); + + VLSEG_FLOAT(&vx0, &vx1, x, vl); + + vx0 = VFABSV_FLOAT(vx0, vl); + vx1 = VFABSV_FLOAT(vx1, vl); + + vx0 = VFADDVV_FLOAT(vx0, vx1, vl); + + // index where element less than v_min + mask = VMFLTVV_FLOAT(vx0, v_min, vl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT(v_min, vx0, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, j += vl) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + + vx0 = VFABSV_FLOAT(vx0, vl); + vx1 = VFABSV_FLOAT(vx1, vl); + + vx0 = VFADDVV_FLOAT(vx0, vx1, vl); + + // index where element less than v_min + mask = VMFLTVV_FLOAT(vx0, v_min, vl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + + //update v_min and start_index j + v_min =
VFMINVV_FLOAT(v_min, vx0, vl); + } + + } + + FLOAT_V_T_M1 v_res, v_max; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_max = VFMVVF_FLOAT_M1(FLT_MAX, vlmax); + + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, vlmax); + minf = VFMVFS_FLOAT_M1(v_res); + mask = VMFLEVF_FLOAT(v_min, minf, vlmax); + min_index = VFIRSTM(mask, vlmax); + + v_min_index = VSLIDEDOWN_UINT(v_min_index, v_min_index, min_index, vlmax); + min_index = VMVVXS_UINT(v_min_index); + + return(min_index+1); +} diff --git a/kernel/riscv64/max_rvv.c b/kernel/riscv64/max_rvv.c new file mode 100644 index 000000000..5b1380d2b --- /dev/null +++ b/kernel/riscv64/max_rvv.c @@ -0,0 +1,98 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include <float.h> + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT vfmax_vv_f32m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT vfmax_vv_f64m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT maxf = 0.0; + + if (n <= 0 || inc_x <= 0) return(maxf); + + FLOAT_V_T vx, vmax; + FLOAT_V_T_M1 v_res; + + v_res = VFMVVF_FLOAT_M1(-FLT_MAX, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vmax = VFMVVF_FLOAT(-FLT_MAX, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vmax = VFMAXVV_FLOAT(vmax, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vmax = VFMAXVV_FLOAT(vmax, vx, vl); + } + + } + + v_res = VFREDMAXVS_FLOAT(v_res, vmax, v_res, vlmax); + maxf = VFMVFS_FLOAT_M1(v_res); + + return(maxf); +} diff --git a/kernel/riscv64/min_rvv.c b/kernel/riscv64/min_rvv.c new file mode 100644 index 000000000..bddcc0ba7 --- /dev/null +++ b/kernel/riscv64/min_rvv.c @@ -0,0 +1,98 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED.
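max_rvv.c above defers all cross-lane work to a single vfredmax after the strip-mined loop; v_res is seeded once with -FLT_MAX and reused as both destination and scalar operand of the reduction. The scalar behavior it must match, including the 0.0 early-out for empty input (illustrative sketch):

static FLOAT max_ref(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    if (n <= 0 || inc_x <= 0) return 0.0;
    FLOAT maxf = x[0];
    for (BLASLONG i = 1; i < n; i++)
        if (x[i * inc_x] > maxf) maxf = x[i * inc_x];
    return maxf;
}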
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include <float.h> + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMINVV_FLOAT vfmin_vv_f32m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMINVV_FLOAT vfmin_vv_f64m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT minf = 0.0; + + if (n <= 0 || inc_x <= 0) return(minf); + + FLOAT_V_T vx, vmin; + FLOAT_V_T_M1 v_res; + + v_res = VFMVVF_FLOAT_M1(FLT_MAX, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vmin = VFMVVF_FLOAT(FLT_MAX, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vmin = VFMINVV_FLOAT(vmin, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vmin = VFMINVV_FLOAT(vmin, vx, vl); + } + + } + + v_res = VFREDMINVS_FLOAT(v_res, vmin, v_res, vlmax); + minf = VFMVFS_FLOAT_M1(v_res); + + return(minf); +} diff --git a/kernel/riscv64/nrm2_rvv.c b/kernel/riscv64/nrm2_rvv.c new file mode 100644 index 000000000..3f5d50397 --- /dev/null +++ b/kernel/riscv64/nrm2_rvv.c @@ -0,0 +1,117 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission.
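The nrm2 kernel that follows folds the usual two-pass scaled norm into one pass: it accumulates the raw sum of squares and the running maximum of |x_i| simultaneously, then applies the identity scale * sqrt(ssq / scale^2) == sqrt(ssq) once at the end. A scalar model of that computation, assuming unit stride (illustrative sketch; unlike a fully scaled LAPACK-style recurrence, the squares here are summed unscaled, so only the final division is guarded):

#include <math.h>

static FLOAT nrm2_ref(BLASLONG n, FLOAT *x)
{
    FLOAT scale = 0.0, ssq = 0.0;
    for (BLASLONG i = 0; i < n; i++) {
        FLOAT v = x[i] < 0 ? -x[i] : x[i];
        if (v > scale) scale = v;     /* vfredmax path  */
        ssq += x[i] * x[i];           /* vfmacc_vv path */
    }
    if (scale == 0.0) return 0.0;     /* all-zero input: avoid 0/0 */
    return scale * sqrt(ssq / (scale * scale));
}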
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m8(n)
+#define VSETVL_MAX vsetvlmax_e32m8()
+#define FLOAT_V_T vfloat32m8_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
+#define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1
+#define VFMACCVV_FLOAT vfmacc_vv_f32m8
+#define VFMVVF_FLOAT vfmv_v_f_f32m8
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
+#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
+#define VFABSV_FLOAT vfabs_v_f32m8
+#define ABS fabsf
+#else
+#define VSETVL(n) vsetvl_e64m8(n)
+#define VSETVL_MAX vsetvlmax_e64m8()
+#define FLOAT_V_T vfloat64m8_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
+#define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1
+#define VFMACCVV_FLOAT vfmacc_vv_f64m8
+#define VFMVVF_FLOAT vfmv_v_f_f64m8
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
+#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
+#define VFABSV_FLOAT vfabs_v_f64m8
+#define ABS fabs
+#endif
+
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+
+    if( n <= 0 ) return(0.0);
+    if(n == 1) return (ABS(x[0]));
+
+    FLOAT_V_T vr, v0;
+    FLOAT_V_T_M1 v_max, v_res;
+    FLOAT scale = 0.0, ssq = 0.0;
+
+    size_t vlmax = VSETVL_MAX;
+    v_res = VFMVVF_FLOAT_M1(0, vlmax);
+    v_max = VFMVVF_FLOAT_M1(0, vlmax);
+
+    vr = VFMVVF_FLOAT(0, vlmax);
+
+    if(inc_x == 1) {
+
+        for (size_t vl; n > 0; n -= vl, x += vl) {
+            vl = VSETVL(n);
+
+            v0 = VLEV_FLOAT(x, vl);
+            v0 = VFABSV_FLOAT(v0, vl);
+
+            v_max = VFREDMAXVS_FLOAT(v_max, v0, v_max, vl);
+
+            vr = VFMACCVV_FLOAT(vr, v0, v0, vl);
+        }
+
+    } else {
+
+        BLASLONG stride_x = inc_x * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl * inc_x) {
+            vl = VSETVL(n);
+
+            v0 = VLSEV_FLOAT(x, stride_x, vl);
+            v0 = VFABSV_FLOAT(v0, vl);
+
+            v_max = VFREDMAXVS_FLOAT(v_max, v0, v_max, vl);
+
+            vr = VFMACCVV_FLOAT(vr, v0, v0, vl);
+        }
+
+    }
+
+    v_res = VFREDSUM_FLOAT(v_res, vr, v_res, vlmax);
+
+    ssq = VFMVFS_FLOAT_M1(v_res);
+    scale = VFMVFS_FLOAT_M1(v_max);
+
+    // all-zero input: scale == 0 would make the division below 0/0
+    if (scale == 0.0) return(0.0);
+
+    ssq = ssq / (scale*scale);
+
+    return(scale * sqrt(ssq));
+}
diff --git a/kernel/riscv64/rot_rvv.c b/kernel/riscv64/rot_rvv.c
new file mode 100644
index 000000000..7bf5e4270
--- /dev/null
+++ b/kernel/riscv64/rot_rvv.c
@@ -0,0 +1,149 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1.
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 +#define VFMACCVF_FLOAT vfmacc_vf_f32m8 +#define VFMULVF_FLOAT vfmul_vf_f32m8 +#define VFMSACVF_FLOAT vfmsac_vf_f32m8 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 +#define VFMACCVF_FLOAT vfmacc_vf_f64m8 +#define VFMULVF_FLOAT vfmul_vf_f64m8 +#define VFMSACVF_FLOAT vfmsac_vf_f64m8 +#endif + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + + if(n <= 0) return(0); + + FLOAT_V_T v0, v1, vx, vy; + + if (inc_x == 0 || inc_y == 0) { + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp; + while(i < n) + { + temp = c*x[ix] + s*y[iy] ; + y[iy] = c*y[iy] - s*x[ix] ; + x[ix] = temp ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + } + } + else if(inc_x == 1 && inc_y == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vy = VLEV_FLOAT(y, vl); + + v0 = VFMULVF_FLOAT(vx, c, vl); + v0 = VFMACCVF_FLOAT(v0, s, vy, vl); + VSEV_FLOAT(x, v0, vl); + + v1 = VFMULVF_FLOAT(vx, s, vl); + v1 = VFMSACVF_FLOAT(v1, c, vy, vl); + VSEV_FLOAT(y, v1, vl); + } + + } else if(inc_y == 1) { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLEV_FLOAT(y, vl); + + v0 = VFMULVF_FLOAT(vx, c, vl); + v0 = VFMACCVF_FLOAT(v0, s, vy, vl); + VSSEV_FLOAT(x, stride_x, v0, vl); + + v1 = VFMULVF_FLOAT(vx, s, vl); + v1 = VFMSACVF_FLOAT(v1, c, vy, vl); + VSEV_FLOAT(y, v1, vl); + } + + } else if(inc_x == 1) { + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + + v0 = VFMULVF_FLOAT(vx, c, vl); + v0 = VFMACCVF_FLOAT(v0, s, 
vy, vl); + VSEV_FLOAT(x, v0, vl); + + v1 = VFMULVF_FLOAT(vx, s, vl); + v1 = VFMSACVF_FLOAT(v1, c, vy, vl); + VSSEV_FLOAT(y, stride_y, v1, vl); + } + + } else { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + + v0 = VFMULVF_FLOAT(vx, c, vl); + v0 = VFMACCVF_FLOAT(v0, s, vy, vl); + VSSEV_FLOAT(x, stride_x, v0, vl); + + v1 = VFMULVF_FLOAT(vx, s, vl); + v1 = VFMSACVF_FLOAT(v1, c, vy, vl); + VSSEV_FLOAT(y, stride_y, v1, vl); + } + + } + + return(0); +} diff --git a/kernel/riscv64/scal_rvv.c b/kernel/riscv64/scal_rvv.c new file mode 100644 index 000000000..d2c0378bf --- /dev/null +++ b/kernel/riscv64/scal_rvv.c @@ -0,0 +1,80 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
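rot_rvv.c above applies a plane rotation with a multiply plus vfmacc for the x update and a multiply plus vfmsac for the y update (vfmsac forms c*vy - s*vx, matching the scalar fallback at the top of the file). A plain-C reference, useful as a correctness oracle when validating the intrinsic paths; rot_ref is an illustrative name:

    /* Reference Givens rotation over n strided elements. */
    void rot_ref(long n, float *x, long incx, float *y, long incy,
                 float c, float s)
    {
        for (long i = 0, ix = 0, iy = 0; i < n; i++, ix += incx, iy += incy) {
            float temp = c * x[ix] + s * y[iy];   /* VFMULVF + VFMACCVF */
            y[iy] = c * y[iy] - s * x[ix];        /* VFMULVF + VFMSACVF */
            x[ix] = temp;
        }
    }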
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 +#define VFMULVF_FLOAT vfmul_vf_f32m8 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 +#define VFMULVF_FLOAT vfmul_vf_f64m8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + if ( (n <= 0) || (inc_x <= 0)) return(0); + + FLOAT_V_T v0; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + + v0 = VLEV_FLOAT(x, vl); + v0 = VFMULVF_FLOAT(v0, da, vl); + VSEV_FLOAT(x, v0, vl); + } + + } else { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + + v0 = VLSEV_FLOAT(x, stride_x, vl); + v0 = VFMULVF_FLOAT(v0, da, vl); + VSSEV_FLOAT(x, stride_x, v0, vl); + } + + } + + return 0; +} diff --git a/kernel/riscv64/sum_rvv.c b/kernel/riscv64/sum_rvv.c new file mode 100644 index 000000000..1db0d09dd --- /dev/null +++ b/kernel/riscv64/sum_rvv.c @@ -0,0 +1,95 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
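scal_rvv.c above is a straight in-place scaling: each strip is loaded, multiplied by the scalar da with vfmul.vf, and stored back to the same addresses (unit-stride or strided). Its reference semantics, including the early return for non-positive n or inc_x that the kernel implements, in a short scalar sketch (scal_ref is an illustrative name):

    /* x := da * x over n elements with stride inc_x. */
    void scal_ref(long n, float da, float *x, long inc_x)
    {
        if (n <= 0 || inc_x <= 0) return;   /* mirrors the kernel's guard */
        for (long i = 0; i < n; i++)
            x[i * inc_x] *= da;
    }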
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFADDVV_FLOAT vfadd_vv_f32m8 +#define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFADDVV_FLOAT vfadd_vv_f64m8 +#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT sumf = 0.0; + if (n <= 0 || inc_x <= 0) return(sumf); + + FLOAT_V_T vx, vsum; + FLOAT_V_T_M1 v_res; + + v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vsum = VFMVVF_FLOAT(0.0, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vsum = VFADDVV_FLOAT(vsum, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vsum = VFADDVV_FLOAT(vsum, vx, vl); + } + + } + + v_res = VFREDSUMVS_FLOAT(v_res, vsum, v_res, vlmax); + sumf = VFMVFS_FLOAT_M1(v_res); + return(sumf); +} diff --git a/kernel/riscv64/swap_rvv.c b/kernel/riscv64/swap_rvv.c new file mode 100644 index 000000000..2cf92f6ad --- /dev/null +++ b/kernel/riscv64/swap_rvv.c @@ -0,0 +1,142 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
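One property of sum_rvv.c above worth keeping in mind: vfredusum is an unordered reduction, so the kernel's result may differ from a left-to-right scalar sum in the last bits. When testing against a sequential reference, a relative-tolerance comparison is the appropriate check; a sketch (the eps choice and the name sum_close are assumptions):

    #include <math.h>

    int sum_close(double kernel_result, double reference, double eps)
    {
        double scale = fabs(reference) > 1.0 ? fabs(reference) : 1.0;
        return fabs(kernel_result - reference) <= eps * scale;
    }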
+IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m8(n)
+#define VSETVL_MAX vsetvlmax_e32m8()
+#define FLOAT_V_T vfloat32m8_t
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
+#define VSEV_FLOAT vse32_v_f32m8
+#define VSSEV_FLOAT vsse32_v_f32m8
+#define VFMVVF_FLOAT vfmv_v_f_f32m8
+#else
+#define VSETVL(n) vsetvl_e64m8(n)
+#define VSETVL_MAX vsetvlmax_e64m8()
+#define FLOAT_V_T vfloat64m8_t
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
+#define VSEV_FLOAT vse64_v_f64m8
+#define VSSEV_FLOAT vsse64_v_f64m8
+#define VFMVVF_FLOAT vfmv_v_f_f64m8
+#endif
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+    BLASLONG stride_x, stride_y;
+    FLOAT_V_T vx, vy;
+
+    if (n <= 0) return(0);
+
+    if (inc_x == 0 && inc_y == 0) {
+        if (n & 1) {
+            FLOAT temp = x[0];
+            x[0] = y[0];
+            y[0] = temp;
+        }
+        else {
+            return 0;
+        }
+    }
+    else if(inc_x == 0) {
+        FLOAT temp = x[0];
+        x[0] = y[(n - 1) * inc_y];
+        FLOAT* ptr = y + (n - 1) * inc_y; // start from the last one
+        stride_y = (0 - inc_y) * sizeof(FLOAT); // reverse
+        BLASLONG m = n - 1;
+        for (size_t vl; m > 0; m -= vl, ptr -= vl*inc_y) {
+            vl = VSETVL(m);
+            // shift y by one logical element, so read one inc_y below the store position
+            vy = VLSEV_FLOAT(ptr - inc_y, stride_y, vl);
+            VSSEV_FLOAT(ptr, stride_y, vy, vl);
+        }
+        y[0] = temp;
+    }
+    else if(inc_y == 0) {
+        FLOAT temp = y[0];
+        y[0] = x[(n - 1) * inc_x];
+        FLOAT* ptr = x + (n - 1) * inc_x; // start from the last one
+        stride_x = (0 - inc_x) * sizeof(FLOAT); // reverse
+        BLASLONG m = n - 1;
+        for (size_t vl; m > 0; m -= vl, ptr -= vl*inc_x) {
+            vl = VSETVL(m);
+            // same shift as above, one inc_x below the store position
+            vx = VLSEV_FLOAT(ptr - inc_x, stride_x, vl);
+            VSSEV_FLOAT(ptr, stride_x, vx, vl);
+        }
+        x[0] = temp;
+    }
+    else if(inc_x == 1 && inc_y == 1) {
+        for (size_t vl; n > 0; n -= vl, x += vl, y += vl) {
+            vl = VSETVL(n);
+
+            vx = VLEV_FLOAT(x, vl);
+            vy = VLEV_FLOAT(y, vl);
+            VSEV_FLOAT(y, vx, vl);
+            VSEV_FLOAT(x, vy, vl);
+        }
+
+    } else if (inc_y == 1) {
+        stride_x = inc_x * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) {
+            vl = VSETVL(n);
+
+            vx = VLSEV_FLOAT(x, stride_x, vl);
+            vy = VLEV_FLOAT(y, vl);
+            VSEV_FLOAT(y, vx, vl);
+            VSSEV_FLOAT(x, stride_x, vy, vl);
+        }
+
+    } else if(inc_x == 1) {
+        stride_y = inc_y * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) {
+            vl = VSETVL(n);
+
+            vx = VLEV_FLOAT(x, vl);
+            vy = VLSEV_FLOAT(y, stride_y, vl);
+            VSSEV_FLOAT(y, stride_y, vx, vl);
+            VSEV_FLOAT(x, vy, vl);
+        }
+
+    } else {
+        stride_x = inc_x * sizeof(FLOAT);
+        stride_y = inc_y * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) {
+            vl = VSETVL(n);
+
+            vx = VLSEV_FLOAT(x, stride_x, vl);
+            vy = VLSEV_FLOAT(y, stride_y, vl);
+            VSSEV_FLOAT(y, stride_y, vx, vl);
+            VSSEV_FLOAT(x, stride_x, vy, vl);
+        }
+    }
+
+    return(0);
+}
diff --git a/kernel/riscv64/symm_lcopy_rvv_v1.c
b/kernel/riscv64/symm_lcopy_rvv_v1.c new file mode 100644 index 000000000..f0def9617 --- /dev/null +++ b/kernel/riscv64/symm_lcopy_rvv_v1.c @@ -0,0 +1,101 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define VSETVL_MAX vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define INT_V_T vint32m2_t +#define VID_V_INT vid_v_i32m2 +#define VADD_VX_INT vadd_vx_i32m2 +#define VMSGT_VX_INT vmsgt_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT vmerge_vvm_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define VSETVL_MAX vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define INT_V_T vint64m2_t +#define VID_V_INT vid_v_i64m2 +#define VADD_VX_INT vadd_vx_i64m2 +#define VMSGT_VX_INT vmsgt_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT vmerge_vvm_f64m2 +#endif + +// Optimizes the implementation in ../generic/symm_lcopy_4.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b) +{ + BLASLONG i, js, offset; + + FLOAT *ao1, *ao2; + + BLASLONG stride_lda = sizeof(FLOAT)*lda; + + FLOAT_V_T vb, va1, va2; + VBOOL_T vbool; + INT_V_T vindex_max, vindex; + + size_t vl = VSETVL_MAX; + vindex_max = VID_V_INT(vl); + + for (js = n; js > 0; js -= vl, posX += vl) { + vl = VSETVL(js); + offset = posX - posY; + + ao1 = a + posX + posY * lda; + ao2 = a + posY + (posX) * lda; + + for (i = m; i > 0; i--, offset--) { + va2 = VLSEV_FLOAT(ao2, stride_lda, vl); + va1 = VLEV_FLOAT(ao1, vl); + + // offset > (0 - vindex) ---> (offset + vindex) > 0 + vindex = VADD_VX_INT(vindex_max, offset, vl); + vbool = VMSGT_VX_INT(vindex, 0, vl); + + vb = VMERGE_VVM_FLOAT(vbool, 
va2, va1, vl); + VSEV_FLOAT(b, vb, vl); + + b += vl; + ao1 += lda; + ao2++; + } + } + + return 0; +} + diff --git a/kernel/riscv64/symm_ucopy_rvv_v1.c b/kernel/riscv64/symm_ucopy_rvv_v1.c new file mode 100644 index 000000000..958506df3 --- /dev/null +++ b/kernel/riscv64/symm_ucopy_rvv_v1.c @@ -0,0 +1,100 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
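The trick in symm_lcopy_rvv_v1.c above is to materialize the triangular predicate with vid.v: each output lane needs A[r][c] when r > c (the stored lower triangle) and the mirrored A[c][r] otherwise, and the kernel evaluates that for vl lanes at once with VADD_VX on the lane indices, VMSGT_VX against zero, and one vmerge between the unit-stride and the lda-strided load. A scalar model of one lane; symm_pick_ref is an illustrative name, and the reading of the merge operand order follows the pre-1.0 RVV intrinsic convention:

    /* offset is the running posX - posY value; lane is the vid.v index. */
    float symm_pick_ref(long offset, long lane, float a_rc, float a_cr)
    {
        /* offset + lane > 0  <=>  row > col for this output element */
        return (offset + lane > 0) ? a_rc : a_cr;
    }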
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define VSETVL_MAX vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define INT_V_T vint32m2_t +#define VID_V_INT vid_v_i32m2 +#define VADD_VX_INT vadd_vx_i32m2 +#define VMSGT_VX_INT vmsgt_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT vmerge_vvm_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define VSETVL_MAX vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define INT_V_T vint64m2_t +#define VID_V_INT vid_v_i64m2 +#define VADD_VX_INT vadd_vx_i64m2 +#define VMSGT_VX_INT vmsgt_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT vmerge_vvm_f64m2 +#endif + +// Optimizes the implementation in ../generic/symm_ucopy_4.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b) +{ + BLASLONG i, js, offset; + + FLOAT *ao1, *ao2; + + BLASLONG stride_lda = sizeof(FLOAT)*lda; + + FLOAT_V_T vb, va1, va2; + VBOOL_T vbool; + INT_V_T vindex_max, vindex; + + size_t vl = VSETVL_MAX; + vindex_max = VID_V_INT(vl); + + for (js = n; js > 0; js -= vl, posX += vl) { + vl = VSETVL(js); + offset = posX - posY; + + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posX + 0 + posY * lda; + + for (i = m; i > 0; i--, offset--) { + va1 = VLSEV_FLOAT(ao1, stride_lda, vl); + va2 = VLEV_FLOAT(ao2, vl); + + // offset > (0 - vindex) ---> (offset + vindex) > 0 + vindex = VADD_VX_INT(vindex_max, offset, vl); + vbool = VMSGT_VX_INT(vindex, 0, vl); + + vb = VMERGE_VVM_FLOAT(vbool, va2, va1, vl); + VSEV_FLOAT(b, vb, vl); + + b += vl; + ao1++; + ao2 += lda; + } + } + + return 0; +} diff --git a/kernel/riscv64/symv_L_rvv.c b/kernel/riscv64/symv_L_rvv.c new file mode 100644 index 000000000..737abaae3 --- /dev/null +++ b/kernel/riscv64/symv_L_rvv.c @@ -0,0 +1,224 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
+IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL_MAX_M1 vsetvlmax_e32m1()
+#define VSETVL(n) vsetvl_e32m8(n)
+#define VSETVL_MAX vsetvlmax_e32m8()
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define FLOAT_V_T vfloat32m8_t
+#define VLEV_FLOAT vle32_v_f32m8
+#define VSEV_FLOAT vse32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
+#define VSSEV_FLOAT vsse32_v_f32m8
+#define VFMACCVV_FLOAT vfmacc_vv_f32m8
+#define VFMACCVF_FLOAT vfmacc_vf_f32m8
+#define VFNMSACVF_FLOAT vfnmsac_vf_f32m8
+#define VFMULVF_FLOAT vfmul_vf_f32m8
+#define VFMVVF_FLOAT vfmv_v_f_f32m8
+#define VFMSACVF_FLOAT vfmsac_vf_f32m8
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
+#define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
+#else
+#define VSETVL_MAX_M1 vsetvlmax_e64m1()
+#define VSETVL(n) vsetvl_e64m8(n)
+#define VSETVL_MAX vsetvlmax_e64m8()
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define FLOAT_V_T vfloat64m8_t
+#define VLEV_FLOAT vle64_v_f64m8
+#define VSEV_FLOAT vse64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
+#define VSSEV_FLOAT vsse64_v_f64m8
+#define VFMACCVV_FLOAT vfmacc_vv_f64m8
+#define VFMACCVF_FLOAT vfmacc_vf_f64m8
+#define VFNMSACVF_FLOAT vfnmsac_vf_f64m8
+#define VFMULVF_FLOAT vfmul_vf_f64m8
+#define VFMVVF_FLOAT vfmv_v_f_f64m8
+#define VFMSACVF_FLOAT vfmsac_vf_f64m8
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
+#define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
+#endif
+
+int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+    BLASLONG i, j, k;
+    BLASLONG ix,iy;
+    BLASLONG jx,jy;
+    FLOAT temp1;
+    FLOAT *a_ptr = a;
+
+    FLOAT_V_T_M1 v_res, v_z0;
+    size_t vlmax = VSETVL_MAX_M1, vl;
+    v_res = VFMVVF_FLOAT_M1(0, vlmax);
+    v_z0 = VFMVVF_FLOAT_M1(0, vlmax);
+    vlmax = VSETVL_MAX;
+
+    FLOAT_V_T va, vx, vy, vr;
+    BLASLONG stride_x, stride_y, inc_xv, inc_yv;
+
+    if(inc_x == 1 && inc_y == 1)
+    {
+        for (j=0; j<offset; j++)
+        {
+            temp1 = alpha * x[j];
+            y[j] += temp1 * a_ptr[j];
+            i = j + 1;
+            k = m - i;
+            for (size_t vl; k > 0; k -= vl, i += vl)
+            {
+                vl = VSETVL(k);
+                vr = VFMVVF_FLOAT(0, vl);
+                va = VLEV_FLOAT(&a_ptr[i], vl);
+                vy = VLEV_FLOAT(&y[i], vl);
+                vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
+                VSEV_FLOAT(&y[i], vy, vl);
+
+                vx = VLEV_FLOAT(&x[i], vl);
+                vr = VFMACCVV_FLOAT(vr, vx, va, vl);
+
+            }
+            v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax);
+
+            y[j] += alpha * VFMVFS_FLOAT_M1(v_res);
+            a_ptr += lda;
+        }
+    }
+    else if(inc_x == 1)
+    {
+        jy = 0;
+        stride_y = inc_y * sizeof(FLOAT);
+        for (j=0; j<offset; j++)
+        {
+            temp1 = alpha * x[j];
+            y[jy] += temp1 * a_ptr[j];
+            iy = jy + inc_y;
+            i = j + 1;
+            k = m - i;
+            for (size_t vl; k > 0; k -= vl, i += vl)
+            {
+                vl = VSETVL(k);
+                inc_yv = inc_y * vl;
+                vr = VFMVVF_FLOAT(0, vl);
+                va = VLEV_FLOAT(&a_ptr[i], vl);
+                vy = VLSEV_FLOAT(&y[iy], stride_y, vl);
+                vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
+                VSSEV_FLOAT(&y[iy], stride_y, vy, vl);
+
+                vx = VLEV_FLOAT(&x[i], vl);
+                vr = VFMACCVV_FLOAT(vr, vx, va, vl);
+
+                iy += inc_yv;
+            }
+            v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax);
+
+            y[jy] += alpha * VFMVFS_FLOAT_M1(v_res);
+            jy += inc_y;
+            a_ptr += lda;
+        }
+    }
+    else if(inc_y == 1)
+    {
+        jx = 0;
+        stride_x = inc_x * sizeof(FLOAT);
+        for (j=0; j<offset; j++)
+        {
+            temp1 = alpha * x[jx];
+            y[j] += temp1 * a_ptr[j];
+            ix = jx + inc_x;
+            i = j + 1;
+            k = m - i;
+            for (size_t vl; k > 0; k -= vl, i += vl)
+            {
+                vl = VSETVL(k);
+                vr = VFMVVF_FLOAT(0, vl);
+                inc_xv = inc_x * vl;
+
+                va = VLEV_FLOAT(&a_ptr[i], vl);
+                vy = VLEV_FLOAT(&y[i], vl);
+                vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
+                VSEV_FLOAT(&y[i], vy, vl);
+
+                vx = VLSEV_FLOAT(&x[ix], stride_x, vl);
+                vr = VFMACCVV_FLOAT(vr, vx, va, vl);
+
+                ix += inc_xv;
+            }
+
+            v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax);
+
+            y[j] += alpha * VFMVFS_FLOAT_M1(v_res);
+            jx += inc_x;
+            a_ptr += lda;
+        }
+    }
+    else
+    {
+        stride_x = inc_x * sizeof(FLOAT);
+        stride_y = inc_y * sizeof(FLOAT);
+        jx = 0;
+        jy = 0;
+        for (j=0; j<offset; j++)
+        {
+            temp1 = alpha * x[jx];
+            y[jy] += temp1 * a_ptr[j];
+            ix = jx + inc_x;
+            iy = jy + inc_y;
+            i = j + 1;
+            k = m - i;
+            for (size_t vl; k > 0; k -= vl, i += vl)
+            {
+                vl = VSETVL(k);
+                inc_xv = inc_x * vl;
+                inc_yv = inc_y * vl;
+                vr = VFMVVF_FLOAT(0, vl);
+
+                va = VLEV_FLOAT(&a_ptr[i], vl);
+                vy = VLSEV_FLOAT(&y[iy], stride_y, vl);
+                vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
+                VSSEV_FLOAT(&y[iy], stride_y, vy, vl);
+
+                vx = VLSEV_FLOAT(&x[ix], stride_x, vl);
+                vr = VFMACCVV_FLOAT(vr, vx, va, vl);
+
+                ix += inc_xv;
+                iy += inc_yv;
+            }
+            v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax);
+
+            y[jy] += alpha * VFMVFS_FLOAT_M1(v_res);
+            jx += inc_x;
+            jy += inc_y;
+            a_ptr += lda;
+        }
+    }
+    return(0);
+}
+
diff --git a/kernel/riscv64/symv_U_rvv.c b/kernel/riscv64/symv_U_rvv.c
new file mode 100644
index 000000000..cb923be5d
--- /dev/null
+++ b/kernel/riscv64/symv_U_rvv.c
@@ -0,0 +1,221 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
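The per-column work in symv_L_rvv.c above fuses two BLAS-1 operations over the strip of rows below the diagonal: an axpy into y and a dot product against x, both fed by the same vector load of the column. A scalar sketch of one unit-stride column update (symv_L_col_ref is an illustrative name, not part of the patch):

    void symv_L_col_ref(long m, long j, float alpha, const float *a_col,
                        const float *x, float *y)
    {
        float temp1 = alpha * x[j], temp2 = 0.0f;
        y[j] += temp1 * a_col[j];           /* diagonal term */
        for (long i = j + 1; i < m; i++) {
            y[i] += temp1 * a_col[i];       /* VFMACCVF into y */
            temp2 += a_col[i] * x[i];       /* VFMACCVV into vr */
        }
        y[j] += alpha * temp2;              /* VFREDSUM epilogue */
    }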
+*****************************************************************************/
+
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL_MAX_M1 vsetvlmax_e32m1()
+#define VSETVL(n) vsetvl_e32m8(n)
+#define VSETVL_MAX vsetvlmax_e32m8()
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define FLOAT_V_T vfloat32m8_t
+#define VLEV_FLOAT vle32_v_f32m8
+#define VSEV_FLOAT vse32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
+#define VSSEV_FLOAT vsse32_v_f32m8
+#define VFMACCVV_FLOAT vfmacc_vv_f32m8
+#define VFMACCVF_FLOAT vfmacc_vf_f32m8
+#define VFNMSACVF_FLOAT vfnmsac_vf_f32m8
+#define VFMULVF_FLOAT vfmul_vf_f32m8
+#define VFMVVF_FLOAT vfmv_v_f_f32m8
+#define VFMSACVF_FLOAT vfmsac_vf_f32m8
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
+#define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
+#else
+#define VSETVL_MAX_M1 vsetvlmax_e64m1()
+#define VSETVL(n) vsetvl_e64m8(n)
+#define VSETVL_MAX vsetvlmax_e64m8()
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define FLOAT_V_T vfloat64m8_t
+#define VLEV_FLOAT vle64_v_f64m8
+#define VSEV_FLOAT vse64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
+#define VSSEV_FLOAT vsse64_v_f64m8
+#define VFMACCVV_FLOAT vfmacc_vv_f64m8
+#define VFMACCVF_FLOAT vfmacc_vf_f64m8
+#define VFNMSACVF_FLOAT vfnmsac_vf_f64m8
+#define VFMULVF_FLOAT vfmul_vf_f64m8
+#define VFMVVF_FLOAT vfmv_v_f_f64m8
+#define VFMSACVF_FLOAT vfmsac_vf_f64m8
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
+#define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
+#endif
+
+int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+    BLASLONG i, j, k;
+    BLASLONG ix,iy;
+    BLASLONG jx,jy;
+    FLOAT temp1;
+    FLOAT *a_ptr = a;
+    FLOAT_V_T_M1 v_res, v_z0;
+    size_t vl_max = VSETVL_MAX_M1, vl;
+    v_res = VFMVVF_FLOAT_M1(0, vl_max);
+    v_z0 = VFMVVF_FLOAT_M1(0, vl_max);
+    vl_max = VSETVL_MAX;
+
+    FLOAT_V_T va, vx, vy, vr;
+    BLASLONG stride_x, stride_y, inc_xv, inc_yv;
+
+    BLASLONG m1 = m - offset;
+    if(inc_x == 1 && inc_y == 1)
+    {
+        a_ptr += m1 * lda;
+        for (j=m1; j<m; j++)
+        {
+            temp1 = alpha * x[j];
+            i = 0;
+            k = j;
+            for (size_t vl; k > 0; k -= vl, i += vl)
+            {
+                vl = VSETVL(k);
+                vr = VFMVVF_FLOAT(0, vl);
+                vy = VLEV_FLOAT(&y[i], vl);
+                va = VLEV_FLOAT(&a_ptr[i], vl);
+                vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
+                VSEV_FLOAT(&y[i], vy, vl);
+
+                vx = VLEV_FLOAT(&x[i], vl);
+                vr = VFMACCVV_FLOAT(vr, vx, va, vl);
+            }
+            v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max);
+
+            y[j] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res);
+            a_ptr += lda;
+        }
+    }
+    else if(inc_x == 1)
+    {
+        jy = m1 * inc_y;
+        a_ptr += m1 * lda;
+        stride_y = inc_y * sizeof(FLOAT);
+        for (j=m1; j<m; j++)
+        {
+            temp1 = alpha * x[j];
+            iy = 0;
+            i = 0;
+            k = j;
+            for (size_t vl; k > 0; k -= vl, i += vl)
+            {
+                vl = VSETVL(k);
+                inc_yv = inc_y * vl;
+                vr = VFMVVF_FLOAT(0, vl);
+                vy = VLSEV_FLOAT(&y[iy], stride_y, vl);
+                va = VLEV_FLOAT(&a_ptr[i], vl);
+                vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
+                VSSEV_FLOAT(&y[iy], stride_y, vy, vl);
+
+                vx = VLEV_FLOAT(&x[i], vl);
+                vr = VFMACCVV_FLOAT(vr, vx, va, vl);
+
+                iy += inc_yv;
+            }
+            v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max);
+
+            y[jy] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res);
+            a_ptr += lda;
+            jy += inc_y;
+        }
+    }
+    else if(inc_y == 1)
+    {
+        jx = m1 * inc_x;
+        a_ptr += m1 * lda;
+        stride_x = inc_x * sizeof(FLOAT);
+        for (j=m1; j<m; j++)
+        {
+            temp1 = alpha * x[jx];
+            ix = 0;
+            i = 0;
+            k = j;
+            for (size_t vl; k > 0; k -= vl, i += vl)
+            {
+                vl = VSETVL(k);
+                inc_xv = inc_x * vl;
+                vr = VFMVVF_FLOAT(0, vl);
+
+                vy = VLEV_FLOAT(&y[i], vl);
+                va = VLEV_FLOAT(&a_ptr[i], vl);
+                vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
+                VSEV_FLOAT(&y[i], vy, vl);
+
+                vx = VLSEV_FLOAT(&x[ix], stride_x, vl);
+                vr = VFMACCVV_FLOAT(vr, vx, va, vl);
+
+                ix += inc_xv;
+            }
+            v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max);
+
+            y[j] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res);
+            a_ptr += lda;
+            jx += inc_x;
+        }
+    }
+    else
+    {
+        jx = m1 * inc_x;
+        jy = m1 * inc_y;
+        a_ptr += m1 * lda;
+        stride_x = inc_x * sizeof(FLOAT);
+        stride_y = inc_y * sizeof(FLOAT);
+        for (j=m1; j<m; j++)
+        {
+            temp1 = alpha * x[jx];
+            ix = 0;
+            iy = 0;
+            i = 0;
+            k = j;
+            for (size_t vl; k > 0; k -= vl, i += vl)
+            {
+                vl = VSETVL(k);
+                inc_xv = inc_x * vl;
+                inc_yv = inc_y * vl;
+                vr = VFMVVF_FLOAT(0, vl);
+                vy = VLSEV_FLOAT(&y[iy], stride_y, vl);
+                va = VLEV_FLOAT(&a_ptr[i], vl);
+                vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
+                VSSEV_FLOAT(&y[iy], stride_y, vy, vl);
+
+                vx = VLSEV_FLOAT(&x[ix], stride_x, vl);
+                vr = VFMACCVV_FLOAT(vr, vx, va, vl);
+                ix += inc_xv;
+                iy += inc_yv;
+            }
+            v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max);
+
+            y[jy] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res);
+            a_ptr += lda;
+            jx += inc_x;
+            jy += inc_y;
+        }
+    }
+    return(0);
+}
diff --git a/kernel/riscv64/trmm_lncopy_rvv_v1.c b/kernel/riscv64/trmm_lncopy_rvv_v1.c
new file mode 100644
index 000000000..73a8233f8
--- /dev/null
+++ b/kernel/riscv64/trmm_lncopy_rvv_v1.c
@@ -0,0 +1,138 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
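symv_U_rvv.c is the mirror image of the lower variant: column j touches rows 0..j-1 above the diagonal, and the diagonal contribution is folded into the same final scalar update (the "y[j] += temp1 * a_ptr[j] + alpha * ..." lines above). A scalar sketch under the same assumptions as before; symv_U_col_ref is an illustrative name:

    void symv_U_col_ref(long j, float alpha, const float *a_col,
                        const float *x, float *y)
    {
        float temp1 = alpha * x[j], temp2 = 0.0f;
        for (long i = 0; i < j; i++) {
            y[i] += temp1 * a_col[i];
            temp2 += a_col[i] * x[i];
        }
        y[j] += temp1 * a_col[j] + alpha * temp2;   /* diagonal folded in */
    }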
+*****************************************************************************/
+
+
+#include <stdio.h>
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define FLOAT_V_T vfloat32m2_t
+#define VLEV_FLOAT vle32_v_f32m2
+#define VSEV_FLOAT vse32_v_f32m2
+#define VLSEV_FLOAT vlse32_v_f32m2
+#define VBOOL_T vbool16_t
+#define UINT_V_T vint32m2_t
+#define VID_V_UINT vid_v_i32m2
+#define VMSGTU_VX_UINT vmsgt_vx_i32m2_b16
+#define VMSEQ_VX_UINT vmseq_vx_i32m2_b16
+#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define FLOAT_V_T vfloat64m2_t
+#define VLEV_FLOAT vle64_v_f64m2
+#define VSEV_FLOAT vse64_v_f64m2
+#define VLSEV_FLOAT vlse64_v_f64m2
+#define VBOOL_T vbool32_t
+#define UINT_V_T vuint64m2_t
+#define VID_V_UINT vid_v_u64m2
+#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32
+#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32
+#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2
+#endif
+
+// Optimizes the implementation in ../arm64/tmmm_lncopy_sve_v1.c
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+
+    BLASLONG i, js, X;
+
+    FLOAT *ao;
+
+    BLASLONG stride_lda = sizeof(FLOAT)*lda;
+
+    FLOAT_V_T vb, va1;
+
+    size_t vl;
+#ifdef UNIT
+    VBOOL_T vbool_eq;
+#endif
+
+    VBOOL_T vbool_cmp;
+    UINT_V_T vindex;
+
+    for (js = n; js > 0; js -= vl)
+    {
+        vl = VSETVL(js);
+        X = posX;
+
+        if (posX <= posY)
+        {
+            ao = a + posY + posX * lda;
+        }
+        else
+        {
+            ao = a + posX + posY * lda;
+        }
+
+        i = 0;
+        do
+        {
+            if (X > posY)
+            {
+                va1 = VLSEV_FLOAT(ao, stride_lda, vl);
+                VSEV_FLOAT(b, va1, vl);
+
+                ao ++;
+                b += vl;
+                X ++;
+                i ++;
+            }
+            else if (X < posY)
+            {
+                ao += lda;
+                b += vl;
+                X ++;
+                i ++;
+            }
+            else
+            {
+                vindex = VID_V_UINT(vl);
+                for (unsigned int j = 0; j < vl; j++)
+                {
+                    va1 = VLSEV_FLOAT(ao, stride_lda, vl);
+                    vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl);
+                    vb = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl);
+#ifdef UNIT
+                    vbool_eq = VMSEQ_VX_UINT(vindex, j, vl);
+                    vb = VFMERGE_VFM_FLOAT(vbool_eq, vb, ONE, vl);
+#endif
+                    VSEV_FLOAT(b, vb, vl);
+                    ao++;
+                    b += vl;
+                }
+
+                X += vl;
+                i += vl;
+            }
+        } while (i < m);
+
+        posY += vl;
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/trmm_ltcopy_rvv_v1.c b/kernel/riscv64/trmm_ltcopy_rvv_v1.c
new file mode 100644
index 000000000..2fe8cf79e
--- /dev/null
+++ b/kernel/riscv64/trmm_ltcopy_rvv_v1.c
@@ -0,0 +1,134 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED.
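For the diagonal block, trmm_lncopy_rvv_v1.c above emits one vl-wide slice per step, zeroing the unstored triangle with a vfmerge against the vid.v mask and, under UNIT, forcing the diagonal to one with a second merge. What lands in the packed buffer b, in scalar form; pack_diag_block_ref is an illustrative name and assumes column-major a:

    void pack_diag_block_ref(int vl, const float *a, long lda, float *b, int unit)
    {
        for (int r = 0; r < vl; r++)            /* the inner j-loop above: ao++ per row */
            for (int c = 0; c < vl; c++) {      /* lanes of one lda-strided load */
                float v = a[r + (long)c * lda]; /* row r of the diagonal block */
                if (c > r) v = 0.0f;            /* unstored upper part -> ZERO */
                if (unit && c == r) v = 1.0f;   /* UNIT: diagonal -> ONE */
                *b++ = v;
            }
    }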
+IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include <stdio.h>
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define FLOAT_V_T vfloat32m2_t
+#define VLEV_FLOAT vle32_v_f32m2
+#define VSEV_FLOAT vse32_v_f32m2
+#define VBOOL_T vbool16_t
+#define UINT_V_T vuint32m2_t
+#define VID_V_UINT vid_v_u32m2
+#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16
+#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16
+#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define FLOAT_V_T vfloat64m2_t
+#define VLEV_FLOAT vle64_v_f64m2
+#define VSEV_FLOAT vse64_v_f64m2
+#define VBOOL_T vbool32_t
+#define UINT_V_T vuint64m2_t
+#define VID_V_UINT vid_v_u64m2
+#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32
+#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32
+#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2
+#endif
+
+// Optimizes the implementation in ../arm64/tmmm_ltcopy_sve_v1.c
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+
+    BLASLONG i, js, X;
+
+    FLOAT *ao;
+
+    FLOAT_V_T vb, va1;
+    size_t vl;
+#ifdef UNIT
+    VBOOL_T vbool_eq;
+#endif
+
+    VBOOL_T vbool_cmp;
+    UINT_V_T vindex;
+
+    for (js = n; js > 0; js -= vl)
+    {
+        vl = VSETVL(js);
+        X = posX;
+
+        if (posX <= posY)
+        {
+            ao = a + posY + posX * lda;
+        }
+        else
+        {
+            ao = a + posX + posY * lda;
+        }
+
+        i = 0;
+        do
+        {
+            if (X > posY)
+            {
+                ao ++;
+                b += vl;
+                X ++;
+                i ++;
+            }
+            else if (X < posY)
+            {
+                va1 = VLEV_FLOAT(ao, vl);
+                VSEV_FLOAT(b, va1, vl);
+
+                ao += lda;
+                b += vl;
+                X ++;
+                i ++;
+            }
+            else
+            {
+                vindex = VID_V_UINT(vl);
+                for (unsigned int j = 0; j < vl; j++)
+                {
+                    va1 = VLEV_FLOAT(ao, vl);
+                    vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl);
+                    vb = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl);
+#ifdef UNIT
+                    vbool_eq = VMSEQ_VX_UINT(vindex, j, vl);
+                    vb = VFMERGE_VFM_FLOAT(vbool_eq, vb, ONE, vl);
+#endif
+                    VSEV_FLOAT(b, vb, vl);
+                    ao += lda;
+                    b += vl;
+                }
+                X += vl;
+                i += vl;
+
+            }
+        } while (i < m);
+
+        posY += vl;
+    }
+
+    return 0;
+}
+
diff --git a/kernel/riscv64/trmm_uncopy_rvv_v1.c b/kernel/riscv64/trmm_uncopy_rvv_v1.c
new file mode 100644
index 000000000..b64cd840d
--- /dev/null
+++ b/kernel/riscv64/trmm_uncopy_rvv_v1.c
@@ -0,0 +1,136 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include <stdio.h>
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define FLOAT_V_T vfloat32m2_t
+#define VLEV_FLOAT vle32_v_f32m2
+#define VLSEV_FLOAT vlse32_v_f32m2
+#define VSEV_FLOAT vse32_v_f32m2
+#define VBOOL_T vbool16_t
+#define UINT_V_T vuint32m2_t
+#define VID_V_UINT vid_v_u32m2
+#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16
+#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16
+#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define FLOAT_V_T vfloat64m2_t
+#define VLEV_FLOAT vle64_v_f64m2
+#define VLSEV_FLOAT vlse64_v_f64m2
+#define VSEV_FLOAT vse64_v_f64m2
+#define VBOOL_T vbool32_t
+#define UINT_V_T vuint64m2_t
+#define VID_V_UINT vid_v_u64m2
+#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32
+#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32
+#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2
+#endif
+
+// Optimizes the implementation in ../arm64/tmmm_uncopy_sve_v1.c
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+
+    BLASLONG i, js, X;
+    BLASLONG stride_lda = sizeof(FLOAT) * lda;
+    FLOAT *ao;
+
+    FLOAT_V_T vb, va1;
+    size_t vl;
+
+#ifdef UNIT
+    VBOOL_T vbool_eq;
+#endif
+
+    VBOOL_T vbool_cmp;
+    UINT_V_T vindex;
+
+    for (js = n; js > 0; js -= vl)
+    {
+        vl = VSETVL(js);
+        X = posX;
+
+        if (posX <= posY)
+        {
+            ao = a + posX + posY * lda;
+        }
+        else
+        {
+            ao = a + posY + posX * lda;
+        }
+
+        i = 0;
+        do
+        {
+            if (X < posY)
+            {
+                va1 = VLSEV_FLOAT(ao, stride_lda, vl);
+                VSEV_FLOAT(b, va1, vl);
+
+                ao ++;
+                b += vl;
+                X ++;
+                i ++;
+            }
+            else if (X > posY)
+            {
+                ao += lda;
+                b += vl;
+                X ++;
+                i ++;
+            }
+            else
+            {
+                vindex = VID_V_UINT(vl);
+                for (unsigned int j = 0; j < vl; j++)
+                {
+                    va1 = VLSEV_FLOAT(ao, stride_lda, vl);
+                    vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl);
+                    vb = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl);
+#ifdef UNIT
+                    vbool_eq = VMSEQ_VX_UINT(vindex, j, vl);
+                    vb = VFMERGE_VFM_FLOAT(vbool_eq, vb, ONE, vl);
+#endif
+                    VSEV_FLOAT(b, vb, vl);
+                    ao++;
+                    b += vl;
+                }
+
+                X += vl;
+                i += vl;
+            }
+        }while (i < m);
+
+        posY += vl;
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/trmm_utcopy_rvv_v1.c b/kernel/riscv64/trmm_utcopy_rvv_v1.c
new file mode 100644
index 000000000..b96daae5b
--- /dev/null
+++ b/kernel/riscv64/trmm_utcopy_rvv_v1.c
@@ -0,0 +1,133 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include <stdio.h>
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define FLOAT_V_T vfloat32m2_t
+#define VLEV_FLOAT vle32_v_f32m2
+#define VSEV_FLOAT vse32_v_f32m2
+#define VBOOL_T vbool16_t
+#define UINT_V_T vuint32m2_t
+#define VID_V_UINT vid_v_u32m2
+#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16
+#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16
+#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define FLOAT_V_T vfloat64m2_t
+#define VLEV_FLOAT vle64_v_f64m2
+#define VSEV_FLOAT vse64_v_f64m2
+#define VBOOL_T vbool32_t
+#define UINT_V_T vuint64m2_t
+#define VID_V_UINT vid_v_u64m2
+#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32
+#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32
+#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2
+#endif
+
+// Optimizes the implementation in ../arm64/tmmm_utcopy_sve_v1.c
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+
+    BLASLONG i, j, js, X;
+
+    FLOAT *ao;
+    FLOAT_V_T vb, va1;
+#ifdef UNIT
+    VBOOL_T vbool_eq;
+#endif
+
+    VBOOL_T vbool_cmp;
+    UINT_V_T vindex;
+
+    size_t vl;
+
+    for (js = n; js > 0; js -= vl)
+    {
+        vl = VSETVL(js);
+
+        X = posX;
+
+        if (posX <= posY)
+        {
+            ao = a + posX + posY * lda;
+        }
+        else
+        {
+            ao = a + posY + posX * lda;
+        }
+
+        i = 0;
+        do
+        {
+            if (X < posY)
+            {
+                ao ++;
+                b += vl;
+                X ++;
+                i++;
+            }
+            else if (X > posY)
+            {
+                va1 = VLEV_FLOAT(ao, vl);
+                VSEV_FLOAT(b, va1, vl);
+                ao += lda;
+                b += vl;
+                X++;
+                i++;
+            }
+            else
+            {
+                vindex = VID_V_UINT(vl);
+                for (j = 0; j < vl; j++)
+                {
+                    va1 = VLEV_FLOAT(ao, vl);
+                    vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl);
+                    vb = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl);
+#ifdef UNIT
+                    vbool_eq = VMSEQ_VX_UINT(vindex, j, vl);
+                    vb = VFMERGE_VFM_FLOAT(vbool_eq, vb, ONE, vl);
+#endif
+                    VSEV_FLOAT(b, vb, vl);
+                    ao += lda;
+                    b += vl;
+                }
+                X += vl;
+                i += vl;
+            }
+
}while (i < m); + posY += vl; + } + return 0; +} + diff --git a/kernel/riscv64/trmmkernel_2x2_rvv.c b/kernel/riscv64/trmmkernel_2x2_rvv.c new file mode 100644 index 000000000..127e76970 --- /dev/null +++ b/kernel/riscv64/trmmkernel_2x2_rvv.c @@ -0,0 +1,342 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
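All four trmm_*copy_rvv_v1.c routines above share the same diagonal treatment: one vfmerge against a triangle mask (vmsgt or vmslt on the vid.v lane indices) zeroes the unstored half, and under UNIT a second vfmerge against an equality mask forces the diagonal to ONE. The lane-level semantics of vfmerge.vfm as a scalar model, with illustrative names:

    /* where the mask bit is set the scalar wins, elsewhere the lane passes */
    float vfmerge_ref(int mask_bit, float src_lane, float scalar)
    {
        return mask_bit ? scalar : src_lane;
    }

    /* composition of the two merges for one diagonal-block lane */
    float diag_lane_ref(long lane, long j, float loaded, int unit)
    {
        float v = vfmerge_ref(lane > j, loaded, 0.0f);
        return unit ? vfmerge_ref(lane == j, v, 1.0f) : v;
    }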
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m4() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMACCVF_FLOAT vfmacc_vf_f32m4 +#define VFMACCVV_FLOAT vfmacc_vv_f32m4 +#define VFREDSUMVS_FLOAT vfredusum_vs_f32m4_f32m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m4() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMACCVF_FLOAT vfmacc_vf_f64m4 +#define VFMACCVV_FLOAT vfmacc_vv_f64m4 +#define VFREDSUMVS_FLOAT vfredusum_vs_f64m4_f64m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + + +// Optimizes the implementation in ../generic/trmmkernel_2x2.c + + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc +#ifdef TRMMKERNEL + ,BLASLONG offset +#endif + ) +{ + BLASLONG i,j,k; + FLOAT *C0,*C1,*ptrba,*ptrbb; + BLASLONG off, temp; + + FLOAT_V_T va0, va1, vb0, vb1; + FLOAT_V_T vres0, vres1, vres2, vres3; + FLOAT_V_T_M1 v_res, v_z0; + v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); + size_t vl; + size_t vlmax = VSETVL_MAX; + +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#else + off = 0; +#endif + + for (j = bn/2; j > 0; j--) + { + C0 = C; + C1 = C0+ldc; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + ptrba = ba; + + for (i = bm/2; i > 0; i--) + { +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*2; + ptrbb = bb + off*2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || \ + (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+2; +#else + temp = off+2; +#endif + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + VLSEG_FLOAT(&va0, &va1, ptrba, vl); + VLSEG_FLOAT(&vb0, &vb1, ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); + + ptrba += vl * 2; + ptrbb += vl * 2; + } + v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax); + C0[0] = alpha * VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUMVS_FLOAT(v_res, vres1, v_z0, vlmax); + C0[1] = alpha * VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUMVS_FLOAT(v_res, vres2, v_z0, vlmax); + C1[0] = alpha * VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUMVS_FLOAT(v_res, vres3, v_z0, vlmax); + C1[1] = alpha * VFMVFS_FLOAT_M1(v_res); + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 2; +#else + temp -= 2; +#endif + ptrba += temp*2; + ptrbb += temp*2; +#endif +#ifdef LEFT + off += 2; +#endif + C0 = C0+2; + C1 = C1+2; + } + + if (bm & 1) + { +#if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off; + ptrbb = bb+off*2; 
+#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+1; +#else + temp = off+2; +#endif + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + va0 = VLEV_FLOAT(ptrba, vl); + VLSEG_FLOAT(&vb0, &vb1, ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + ptrba += vl; + ptrbb += vl * 2; + + } + v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax); + C0[0] = alpha * VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUMVS_FLOAT(v_res, vres1, v_z0, vlmax); + C1[0] = alpha * VFMVFS_FLOAT_M1(v_res); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk-off; +#ifdef LEFT + temp -= 1; +#else + temp -= 2; +#endif + ptrba += temp; + ptrbb += temp*2; +#endif +#ifdef LEFT + off += 1; +#endif + C0 = C0+1; + C1 = C1+1; + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; +#endif + k = (bk<<1); + bb = bb+k; + i = (ldc<<1); + C = C+i; + } + + if (bn & 1) + { + C0 = C; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + ptrba = ba; + + for (i = bm/2; i > 0; i--) + { +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*2; + ptrbb = bb + off; +#endif + + +#if (defined(LEFT) && !defined(TRANSA)) || \ + (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+2; +#else + temp = off+1; +#endif + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + vb0 = VLEV_FLOAT(ptrbb, vl); + VLSEG_FLOAT(&va0, &va1, ptrba, vl); + + vres0 = VFMACCVV_FLOAT(vres0, vb0, va0, vl); + vres1 = VFMACCVV_FLOAT(vres1, vb0, va1, vl); + + ptrba += vl * 2; + ptrbb += vl; + + } + v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax); + C0[0] = alpha * VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUMVS_FLOAT(v_res, vres1, v_z0, vlmax); + C0[1] = alpha * VFMVFS_FLOAT_M1(v_res); + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 2; +#else + temp -= 1; +#endif + ptrba += temp*2; + ptrbb += temp; +#endif +#ifdef LEFT + off += 2; +#endif + + C0 = C0+2; + } + + if (bm & 1) + { +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off; + ptrbb = bb+off; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off + 1; +#else + temp = off + 1; +#endif + vres0 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + va0 = VLEV_FLOAT(ptrba, vl); + vb0 = VLEV_FLOAT(ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, vb0, va0, vl); + ptrba += vl; + ptrbb += vl; + } + v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax); + C0[0] = alpha * VFMVFS_FLOAT_M1(v_res); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk-off; +#ifdef LEFT + temp -= 1; +#else + temp -= 1; +#endif + ptrba += temp; + ptrbb += temp; +#endif +#ifdef LEFT + off += 1; +#endif + C0 = C0+1; + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; +#endif + k = (bk<<0); + bb = bb+k; + C = C+ldc; + } + return 0; +} + diff --git a/kernel/riscv64/trmmkernel_4x4_rvv.c b/kernel/riscv64/trmmkernel_4x4_rvv.c new file mode 100644 index 000000000..3e46c6348 --- /dev/null +++ 
b/kernel/riscv64/trmmkernel_4x4_rvv.c
@@ -0,0 +1,881 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <stdbool.h>
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define VSETVL_MAX vsetvlmax_e32m2()
+#define VSETVL_MAX_M1 vsetvlmax_e32m1()
+#define FLOAT_V_T vfloat32m2_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VLEV_FLOAT vle32_v_f32m2
+#define VLSEG4_FLOAT vlseg4e32_v_f32m2
+#define VLSEG2_FLOAT vlseg2e32_v_f32m2
+#define VFMVVF_FLOAT vfmv_v_f_f32m2
+#define VFMUL_FLOAT vfmul_vv_f32m2
+#define VFMACCVF_FLOAT vfmacc_vf_f32m2
+#define VFMACCVV_FLOAT vfmacc_vv_f32m2
+#define VFREDSUMVS_FLOAT vfredusum_vs_f32m2_f32m1
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define VSETVL_MAX vsetvlmax_e64m2()
+#define VSETVL_MAX_M1 vsetvlmax_e64m1()
+#define FLOAT_V_T vfloat64m2_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VLEV_FLOAT vle64_v_f64m2
+#define VLSEG4_FLOAT vlseg4e64_v_f64m2
+#define VLSEG2_FLOAT vlseg2e64_v_f64m2
+#define VFMVVF_FLOAT vfmv_v_f_f64m2
+#define VFMUL_FLOAT vfmul_vv_f64m2
+#define VFMACCVF_FLOAT vfmacc_vf_f64m2
+#define VFMACCVV_FLOAT vfmacc_vv_f64m2
+#define VFREDSUMVS_FLOAT vfredusum_vs_f64m2_f64m1
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
+#endif
+
+
+// Optimizes the implementation in ../generic/trmmkernel_4x4.c
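+
+// Overview of the scheme below (a summary, not normative): every element of
+// a 4x4 tile of C is computed as an independent dot product along K.
+// VLSEG4_FLOAT deinterleaves four packed rows (or columns) per segment load,
+// VFMACCVV_FLOAT accumulates the element-wise partial products, and one
+// VFREDSUMVS_FLOAT reduction per accumulator collapses it to a scalar that
+// is scaled by alpha and stored to C.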
+
+int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset)
+{
+
+    BLASLONG i,j,k;
+    FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb;
+
+    FLOAT_V_T va0, va1, va2, va3, vb0, vb1, vb2, vb3;
+    FLOAT_V_T_M1 vsum0, vsum1, vsum2, vsum3, v_z0;
+    v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1);
+    // initialize the reduction targets so their first use below is defined
+    vsum0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1);
+    vsum1 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1);
+    vsum2 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1);
+    vsum3 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1);
+    size_t vl;
+    size_t vlmax = VSETVL_MAX;
+
+    FLOAT_V_T vres0_0;
+    FLOAT_V_T vres0_1;
+    FLOAT_V_T vres0_2;
+    FLOAT_V_T vres0_3;
+
+    FLOAT_V_T vres1_0;
+    FLOAT_V_T vres1_1;
+    FLOAT_V_T vres1_2;
+    FLOAT_V_T vres1_3;
+
+    FLOAT_V_T vres2_0;
+    FLOAT_V_T vres2_1;
+    FLOAT_V_T vres2_2;
+    FLOAT_V_T vres2_3;
+
+    FLOAT_V_T vres3_0;
+    FLOAT_V_T vres3_1;
+    FLOAT_V_T vres3_2;
+    FLOAT_V_T vres3_3;
+
+    BLASLONG off, temp;
+
+    bool left;
+    bool transposed;
+    bool backwards;
+
+#ifdef LEFT
+    left = true;
+#else
+    left = false;
+#endif
+
+#ifdef TRANSA
+    transposed = true;
+#else
+    transposed = false;
+#endif
+
+    backwards = left != transposed;
+
+    if (!left) {
+        off = -offset;
+    }
+
+    for (j=0; j<bn/4; j+=1) // do blocks of the Mx4 loops
+    {
+        C0 = C;
+        C1 = C0+ldc;
+        C2 = C1+ldc;
+        C3 = C2+ldc;
+
+        if (left) {
+            off = offset;
+        }
+
+        ptrba = ba;
+
+        for (i=0; i<bm/4; i+=1) // do blocks of 4x4 loops
+        {
+            if (backwards) {
+                ptrba += off*4; // number of values in A
+                ptrbb = bb + off*4; // number of values in B
+            }
+            else {
+                ptrbb = bb;
+            }
+
+            vres0_0 = VFMVVF_FLOAT(0, vlmax);
+            vres0_1 = VFMVVF_FLOAT(0, vlmax);
+            vres0_2 = VFMVVF_FLOAT(0, vlmax);
+            vres0_3 = VFMVVF_FLOAT(0, vlmax);
+
+            vres1_0 = VFMVVF_FLOAT(0, vlmax);
+            vres1_1 = VFMVVF_FLOAT(0, vlmax);
+            vres1_2 = VFMVVF_FLOAT(0, vlmax);
+            vres1_3 = VFMVVF_FLOAT(0, vlmax);
+
+            vres2_0 = VFMVVF_FLOAT(0, vlmax);
+            vres2_1 = VFMVVF_FLOAT(0, vlmax);
+            vres2_2 = VFMVVF_FLOAT(0, vlmax);
+            vres2_3 = VFMVVF_FLOAT(0, vlmax);
+
+            vres3_0 = VFMVVF_FLOAT(0, vlmax);
+            vres3_1 = VFMVVF_FLOAT(0, vlmax);
+            vres3_2 = VFMVVF_FLOAT(0, vlmax);
+            vres3_3 = VFMVVF_FLOAT(0, vlmax);
+
+            // count of K values touched by this block: all remaining ones
+            // when walking backwards, off plus the block size otherwise
+            temp = backwards ? bk - off : off + 4;
+            for (k = temp; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl);
+                VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl);
+
+                vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl);
+                vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl);
+                vres2_0 = VFMACCVV_FLOAT(vres2_0, va0, vb2, vl);
+                vres3_0 = VFMACCVV_FLOAT(vres3_0, va0, vb3, vl);
+
+                vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl);
+                vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl);
+                vres2_1 = VFMACCVV_FLOAT(vres2_1, va1, vb2, vl);
+                vres3_1 = VFMACCVV_FLOAT(vres3_1, va1, vb3, vl);
+
+                vres0_2 = VFMACCVV_FLOAT(vres0_2, va2, vb0, vl);
+                vres1_2 = VFMACCVV_FLOAT(vres1_2, va2, vb1, vl);
+                vres2_2 = VFMACCVV_FLOAT(vres2_2, va2, vb2, vl);
+                vres3_2 = VFMACCVV_FLOAT(vres3_2, va2, vb3, vl);
+
+                vres0_3 = VFMACCVV_FLOAT(vres0_3, va3, vb0, vl);
+                vres1_3 = VFMACCVV_FLOAT(vres1_3, va3, vb1, vl);
+                vres2_3 = VFMACCVV_FLOAT(vres2_3, va3, vb2, vl);
+                vres3_3 = VFMACCVV_FLOAT(vres3_3, va3, vb3, vl);
+
+                ptrba += vl * 4;
+                ptrbb += vl * 4;
+            }
+
+            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax);
+            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax);
+            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres0_2, v_z0, vlmax);
+            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres0_3, v_z0, vlmax);
+            C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
+            C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1);
+            C0[2] = alpha * VFMVFS_FLOAT_M1(vsum2);
+            C0[3] = alpha * VFMVFS_FLOAT_M1(vsum3);
+
+            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres1_0, v_z0, vlmax);
+            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_1, v_z0, vlmax);
+            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_2, v_z0, vlmax);
+            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_3, v_z0, vlmax);
+            C1[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
+            C1[1] = alpha * VFMVFS_FLOAT_M1(vsum1);
+            C1[2] = alpha * VFMVFS_FLOAT_M1(vsum2);
+            C1[3] = alpha * VFMVFS_FLOAT_M1(vsum3);
+
+            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2_0, v_z0, vlmax);
+            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres2_1, v_z0, vlmax);
+            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2_2, v_z0, vlmax);
+            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres2_3, v_z0, vlmax);
+            C2[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
+            C2[1] = alpha * VFMVFS_FLOAT_M1(vsum1);
+            C2[2] = alpha * VFMVFS_FLOAT_M1(vsum2);
+            C2[3] = alpha * VFMVFS_FLOAT_M1(vsum3);
+
+            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres3_0, v_z0, vlmax);
+            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres3_1, v_z0, vlmax);
+            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres3_2, v_z0, vlmax);
+            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3_3, v_z0, vlmax);
+            C3[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
+            C3[1] = alpha * VFMVFS_FLOAT_M1(vsum1);
+            C3[2] = alpha * VFMVFS_FLOAT_M1(vsum2);
+            C3[3] = alpha * VFMVFS_FLOAT_M1(vsum3);
+
+            if (!backwards) {
+                temp = bk-off;
+                temp = left ?
temp - 4 : // number of values in A + temp - 4; // number of values in B + + ptrba += temp*4; // number of values in A + ptrbb += temp*4; // number of values in B + } +#ifdef LEFT + off += 4; // number of values in A +#endif + + C0 = C0+4; + C1 = C1+4; + C2 = C2+4; + C3 = C3+4; + + } + + if ( bm & 2 ) // do any 2x4 loop + { + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*2; + ptrbb = bb + off*4; +#endif + + vres0_0 = VFMVVF_FLOAT(0, vlmax); + vres0_1 = VFMVVF_FLOAT(0, vlmax); + + vres1_0 = VFMVVF_FLOAT(0, vlmax); + vres1_1 = VFMVVF_FLOAT(0, vlmax); + + vres2_0 = VFMVVF_FLOAT(0, vlmax); + vres2_1 = VFMVVF_FLOAT(0, vlmax); + + vres3_0 = VFMVVF_FLOAT(0, vlmax); + vres3_1 = VFMVVF_FLOAT(0, vlmax); + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+2; // number of values in A +#else + temp = off+4; // number of values in B +#endif + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); + + vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); + vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); + vres2_0 = VFMACCVV_FLOAT(vres2_0, va0, vb2, vl); + vres3_0 = VFMACCVV_FLOAT(vres3_0, va0, vb3, vl); + + vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); + vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl); + vres2_1 = VFMACCVV_FLOAT(vres2_1, va1, vb2, vl); + vres3_1 = VFMACCVV_FLOAT(vres3_1, va1, vb3, vl); + + ptrba += vl * 2; + ptrbb += vl * 4; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_0, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_1, v_z0, vlmax); + + C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); + C1[0] = alpha * VFMVFS_FLOAT_M1(vsum2); + C1[1] = alpha * VFMVFS_FLOAT_M1(vsum3); + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2_0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres2_1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres3_0, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3_1, v_z0, vlmax); + + C2[0] = alpha * VFMVFS_FLOAT_M1(vsum0); + C2[1] = alpha * VFMVFS_FLOAT_M1(vsum1); + C3[0] = alpha * VFMVFS_FLOAT_M1(vsum2); + C3[1] = alpha * VFMVFS_FLOAT_M1(vsum3); + + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 4; // number of values in B +#endif + ptrba += temp*2; + ptrbb += temp*4; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif + + C0 = C0+2; + C1 = C1+2; + C2 = C2+2; + C3 = C3+2; + + } + + if ( bm & 1 ) // do any 1x4 loop + { + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*1; + ptrbb = bb + off*4; +#endif + + vres0_0 = VFMVVF_FLOAT(0, vlmax); + vres1_0 = VFMVVF_FLOAT(0, vlmax); + vres2_0 = VFMVVF_FLOAT(0, vlmax); + vres3_0 = VFMVVF_FLOAT(0, vlmax); + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+1; // number of values in A +#else + temp = off+4; // number of values in B +#endif + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + va0 = VLEV_FLOAT(ptrba, vl); + VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); + + vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); + vres1_0 = VFMACCVV_FLOAT(vres1_0, 
va0, vb1, vl);
+                vres2_0 = VFMACCVV_FLOAT(vres2_0, va0, vb2, vl);
+                vres3_0 = VFMACCVV_FLOAT(vres3_0, va0, vb3, vl);
+
+                ptrba += vl;
+                ptrbb += vl * 4;
+            }
+
+            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax);
+            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_0, v_z0, vlmax);
+            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2_0, v_z0, vlmax);
+            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3_0, v_z0, vlmax);
+
+            C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
+            C1[0] = alpha * VFMVFS_FLOAT_M1(vsum1);
+            C2[0] = alpha * VFMVFS_FLOAT_M1(vsum2);
+            C3[0] = alpha * VFMVFS_FLOAT_M1(vsum3);
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= 1; // number of values in A
+#else
+            temp -= 4; // number of values in B
+#endif
+            ptrba += temp*1;
+            ptrbb += temp*4;
+#endif
+
+#ifdef LEFT
+            off += 1; // number of values in A
+#endif
+
+            C0 = C0+1;
+            C1 = C1+1;
+            C2 = C2+1;
+            C3 = C3+1;
+
+        }
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        off += 4;
+#endif
+
+        k = (bk<<2);
+        bb = bb+k;
+        i = (ldc<<2);
+        C = C+i;
+    }
+
+    for (j=0; j<(bn&2); j+=2) // do the Mx2 loops
+    {
+        C0 = C;
+        C1 = C0+ldc;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        off = offset;
+#endif
+
+        ptrba = ba;
+
+        for (i=0; i<bm/4; i+=1) // do blocks of 4x2 loops
+        {
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*4;
+            ptrbb = bb + off*2;
+#endif
+
+            vres0_0 = VFMVVF_FLOAT(0, vlmax);
+            vres0_1 = VFMVVF_FLOAT(0, vlmax);
+            vres0_2 = VFMVVF_FLOAT(0, vlmax);
+            vres0_3 = VFMVVF_FLOAT(0, vlmax);
+
+            vres1_0 = VFMVVF_FLOAT(0, vlmax);
+            vres1_1 = VFMVVF_FLOAT(0, vlmax);
+            vres1_2 = VFMVVF_FLOAT(0, vlmax);
+            vres1_3 = VFMVVF_FLOAT(0, vlmax);
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+4; // number of values in A
+#else
+            temp = off+2; // number of values in B
+#endif
+            for (k = temp; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl);
+                VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl);
+
+                vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl);
+                vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl);
+
+                vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl);
+                vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl);
+
+                vres0_2 = VFMACCVV_FLOAT(vres0_2, va2, vb0, vl);
+                vres1_2 = VFMACCVV_FLOAT(vres1_2, va2, vb1, vl);
+
+                vres0_3 = VFMACCVV_FLOAT(vres0_3, va3, vb0, vl);
+                vres1_3 = VFMACCVV_FLOAT(vres1_3, va3, vb1, vl);
+
+                ptrba += vl * 4;
+                ptrbb += vl * 2;
+            }
+
+            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax);
+            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax);
+            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres0_2, v_z0, vlmax);
+            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres0_3, v_z0, vlmax);
+            C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
+            C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1);
+            C0[2] = alpha * VFMVFS_FLOAT_M1(vsum2);
+            C0[3] = alpha * VFMVFS_FLOAT_M1(vsum3);
+
+            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres1_0, v_z0, vlmax);
+            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_1, v_z0, vlmax);
+            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_2, v_z0, vlmax);
+            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_3, v_z0, vlmax);
+            C1[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
+            C1[1] = alpha * VFMVFS_FLOAT_M1(vsum1);
+            C1[2] = alpha * VFMVFS_FLOAT_M1(vsum2);
+            C1[3] = alpha * VFMVFS_FLOAT_M1(vsum3);
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= 4; // number of values in A
+#else
+            temp -= 2; // number of values in B
+#endif
+            ptrba += temp*4;
+            ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+            off += 4; // number of values in A
+#endif
+
+            C0 = C0+4;
+            C1 = C1+4;
+
+        }
+
+        if ( bm & 2 ) // do any 2x2 loop
+        {
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*2;
+            ptrbb = bb + off*2;
+#endif
+
+            vres0_0 = VFMVVF_FLOAT(0, vlmax);
+            vres0_1 = VFMVVF_FLOAT(0, vlmax);
+
+            vres1_0 = VFMVVF_FLOAT(0, vlmax);
+            vres1_1 = VFMVVF_FLOAT(0, vlmax);
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+2; // number of values in A
+#else
+            temp = off+2; // number of values in B
+#endif
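+            // For reference, a scalar sketch of what the strip-mined loop
+            // below computes (illustrative names, not compiled code):
+            //   for (kk = 0; kk < temp; kk++) {
+            //       c00 += ptrba[2*kk+0] * ptrbb[2*kk+0]; // -> C0[0]
+            //       c01 += ptrba[2*kk+1] * ptrbb[2*kk+0]; // -> C0[1]
+            //       c10 += ptrba[2*kk+0] * ptrbb[2*kk+1]; // -> C1[0]
+            //       c11 += ptrba[2*kk+1] * ptrbb[2*kk+1]; // -> C1[1]
+            //   }
+            // VLSEG2_FLOAT deinterleaves the packed pairs so each
+            // VFMACCVV_FLOAT covers vl of these iterations at once.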
+            for (k = temp; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
+                VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl);
+
+                vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl);
+                vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl);
+
+                vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl);
+                vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl);
+
+                ptrba += vl * 2;
+                ptrbb += vl * 2;
+            }
+
+            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax);
+            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax);
+            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_0, v_z0, vlmax);
+            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_1, v_z0, vlmax);
+
+            C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
+            C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1);
+            C1[0] = alpha * VFMVFS_FLOAT_M1(vsum2);
+            C1[1] = alpha * VFMVFS_FLOAT_M1(vsum3);
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= 2; // number of values in A
+#else
+            temp -= 2; // number of values in B
+#endif
+            ptrba += temp*2;
+            ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+            off += 2; // number of values in A
+#endif
+
+            C0 = C0+2;
+            C1 = C1+2;
+
+        }
+
+        if ( bm & 1 ) // do any 1x2 loop
+        {
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*1;
+            ptrbb = bb + off*2;
+#endif
+
+
+            vres0_0 = VFMVVF_FLOAT(0, vlmax);
+            vres1_0 = VFMVVF_FLOAT(0, vlmax);
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+1; // number of values in A
+#else
+            temp = off+2; // number of values in B
+#endif
+
+            for (k = temp; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                va0 = VLEV_FLOAT(ptrba, vl);
+                VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl);
+
+                vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl);
+                vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl);
+
+                ptrba += vl;
+                ptrbb += vl * 2;
+            }
+
+            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax);
+            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_0, v_z0, vlmax);
+            C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
+            C1[0] = alpha * VFMVFS_FLOAT_M1(vsum1);
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= 1; // number of values in A
+#else
+            temp -= 2; // number of values in B
+#endif
+            ptrba += temp*1;
+            ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+            off += 1; // number of values in A
+#endif
+
+            C0 = C0+1;
+            C1 = C1+1;
+
+        }
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        off += 2;
+#endif
+
+        k = (bk<<1);
+        bb = bb+k;
+        i = (ldc<<1);
+        C = C+i;
+    }
+
+    for (j=0; j<(bn&1); j+=1) // do the Mx1 loops
+    {
+        C0 = C;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        off = offset;
+#endif
+
+        ptrba = ba;
+
+        for (i=0; i<bm/4; i+=1) // do blocks of 4x1 loops
+        {
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*4;
+            ptrbb = bb + off*1;
+#endif
+
+            vres0_0 = VFMVVF_FLOAT(0, vlmax);
+            vres0_1 = VFMVVF_FLOAT(0, vlmax);
+            vres0_2 = VFMVVF_FLOAT(0, vlmax);
+            vres0_3 = VFMVVF_FLOAT(0, vlmax);
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+4; // number of values in A
+#else
+            temp = off+1; // number of values in B
+#endif
+            for (k = temp; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl);
+                vb0 = VLEV_FLOAT(ptrbb, vl);
+
+                vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl);
+
+                vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl);
+
+                vres0_2 = VFMACCVV_FLOAT(vres0_2, va2, vb0, vl);
+
+                vres0_3 = VFMACCVV_FLOAT(vres0_3, va3, vb0, vl);
+
+                ptrba += vl * 4;
+                ptrbb += vl;
+            }
+
+            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax);
+            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax);
+            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres0_2, v_z0, vlmax);
+            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres0_3, v_z0, vlmax);
+            C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
+            C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1);
+            C0[2] = alpha * VFMVFS_FLOAT_M1(vsum2);
+            C0[3] = alpha * VFMVFS_FLOAT_M1(vsum3);
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
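+            // advance the packed-panel pointers past the part of K this
+            // triangular block does not touch, so the next block starts on
+            // its own data (mirrors ../generic/trmmkernel_4x4.c)
+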
temp = bk - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 1; // number of values in B +#endif + ptrba += temp*4; + ptrbb += temp*1; +#endif + +#ifdef LEFT + off += 4; // number of values in A +#endif + + C0 = C0+4; + + } + + if ( bm & 2 ) // do any 2x1 loop + { + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*2; + ptrbb = bb + off*1; +#endif + + vres0_0 = VFMVVF_FLOAT(0, vlmax); + vres0_1 = VFMVVF_FLOAT(0, vlmax); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+2; // number of values in A +#else + temp = off+1; // number of values in B +#endif + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vb0 = VLEV_FLOAT(ptrbb, vl); + + vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); + + vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); + + ptrba += vl * 2; + ptrbb += vl; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); + C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); + + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 1; // number of values in B +#endif + ptrba += temp*2; + ptrbb += temp*1; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif + + C0 = C0+2; + + } + + if ( bm & 1 ) // do any 1x1 loop + { + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*1; + ptrbb = bb + off*1; +#endif + + vres0_0 = VFMVVF_FLOAT(0, vlmax); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+1; // number of values in A +#else + temp = off+1; // number of values in B +#endif + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + va0 = VLEV_FLOAT(ptrba, vl); + vb0 = VLEV_FLOAT(ptrbb, vl); + + vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); + + ptrba += vl; + ptrbb += vl; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); + C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); + + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 1; // number of values in B +#endif + ptrba += temp*1; + ptrbb += temp*1; +#endif + +#ifdef LEFT + off += 1; // number of values in A +#endif + + C0 = C0+1; + + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; +#endif + + k = (bk<<0); + bb = bb+k; + C = C+ldc; + } + return 0; +} diff --git a/kernel/riscv64/trmmkernel_rvv_v1x8.c b/kernel/riscv64/trmmkernel_rvv_v1x8.c new file mode 100644 index 000000000..97b14650c --- /dev/null +++ b/kernel/riscv64/trmmkernel_rvv_v1x8.c @@ -0,0 +1,685 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VFMVVF_FLOAT vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT vfmacc_vf_f32m2 +#define VFMULVF_FLOAT vfmul_vf_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VFMVVF_FLOAT vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT vfmacc_vf_f64m2 +#define VFMULVF_FLOAT vfmul_vf_f64m2 +#endif + + +// Optimizes the implementation in ../generic/trmmkernel_8x8.c + + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) +{ + //fprintf(stderr, "%s, %s, bm=%4ld bn=%4ld bk=%4ld alpha=%f ldc=%ld\n", __FILE__, __FUNCTION__, bm, bn, bk, alpha, ldc); + + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7,*ptrba,*ptrbb; + + FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7; + FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; + size_t vl; + + BLASLONG off, temp; + +#if !defined(LEFT) + off = -offset; +#else + off = 0; +#endif + for (j = bn/8; j > 0; j--) + { + C0 = C; + C1 = C0+ldc; + C2 = C1+ldc; + C3 = C2+ldc; + C4 = C3+ldc; + C5 = C4+ldc; + C6 = C5+ldc; + C7 = C6+ldc; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + ptrba = ba; + + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*vl; + ptrbb = bb + off*8; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + vres4 = VFMVVF_FLOAT(0.0, vl); + vres5 = VFMVVF_FLOAT(0.0, vl); + vres6 = VFMVVF_FLOAT(0.0, vl); + vres7 = VFMVVF_FLOAT(0.0, vl); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+vl; // number of values in A +#else + temp = off+8; // number of values in B +#endif + + for (k = temp/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = 
VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); + ptrbb += 8; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va1, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va1, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va1, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va1, vl); + ptrbb += 8; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va2, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va2, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va2, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va2, vl); + ptrbb += 8; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va3, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va3, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va3, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va3, vl); + ptrbb += 8; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va4, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va4, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va4, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va4, vl); + ptrbb += 8; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va5, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va5, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va5, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va5, vl); + ptrbb += 8; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va6, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va6, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va6, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va6, vl); + ptrbb += 8; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb 
+ 1), va7, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va7, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va7, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va7, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va7, vl); + ptrbb += 8; + } + + for (k = temp&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); // M:8 (should be vlen); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); + + ptrbb += 8; + ptrba += vl; + } + + va0 = VFMULVF_FLOAT(vres0, alpha, vl); + VSEV_FLOAT(C0, va0, vl); + + va1 = VFMULVF_FLOAT(vres1, alpha, vl); + VSEV_FLOAT(C1, va1, vl); + + va2 = VFMULVF_FLOAT(vres2, alpha, vl); + VSEV_FLOAT(C2, va2, vl); + + va3 = VFMULVF_FLOAT(vres3, alpha, vl); + VSEV_FLOAT(C3, va3, vl); + + va4 = VFMULVF_FLOAT(vres4, alpha, vl); + VSEV_FLOAT(C4, va4, vl); + + va5 = VFMULVF_FLOAT(vres5, alpha, vl); + VSEV_FLOAT(C5, va5, vl); + + va6 = VFMULVF_FLOAT(vres6, alpha, vl); + VSEV_FLOAT(C6, va6, vl); + + va7 = VFMULVF_FLOAT(vres7, alpha, vl); + VSEV_FLOAT(C7, va7, vl); + + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= vl; // number of values in A +#else + temp -= 8; // number of values in B +#endif + ptrba += temp*vl; + ptrbb += temp*8; +#endif + +#ifdef LEFT + off += vl; // number of values in A +#endif + + C0 += vl; + C1 += vl; + C2 += vl; + C3 += vl; + C4 += vl; + C5 += vl; + C6 += vl; + C7 += vl; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 8; +#endif + + bb += (bk<<3); + C += (ldc<<3); + } + + if (bn & 4) + { + C0 = C; + C1 = C0+ldc; + C2 = C1+ldc; + C3 = C2+ldc; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + ptrba = ba; + + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*vl; + ptrbb = bb + off*4; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+vl; // number of values in A +#else + temp = off+4; // number of values in B +#endif + + for (k = temp/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + ptrbb += 4; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl); + ptrbb += 4; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, 
vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl); + ptrbb += 4; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl); + ptrbb += 4; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl); + ptrbb += 4; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl); + ptrbb += 4; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl); + ptrbb += 4; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl); + ptrbb += 4; + } + + // K remainder + for (k = temp&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + + ptrbb += 4; + ptrba += vl; + } + + va0 = VFMULVF_FLOAT(vres0, alpha, vl); + VSEV_FLOAT(C0, va0, vl); + + va1 = VFMULVF_FLOAT(vres1, alpha, vl); + VSEV_FLOAT(C1, va1, vl); + + va2 = VFMULVF_FLOAT(vres2, alpha, vl); + VSEV_FLOAT(C2, va2, vl); + + va3 = VFMULVF_FLOAT(vres3, alpha, vl); + VSEV_FLOAT(C3, va3, vl); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= vl; // number of values in A +#else + temp -= 4; // number of values in B +#endif + ptrba += temp*vl; + ptrbb += temp*4; +#endif + +#ifdef LEFT + off += vl; // number of values in A +#endif + + C0 += vl; + C1 += vl; + C2 += vl; + C3 += vl; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 4; +#endif + + bb += (bk<<2); + C += (ldc<<2); + } + + if (bn & 2) + { + C0 = C; + C1 = C0+ldc; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + ptrba = ba; + + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*vl; + ptrbb = bb + off*2; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+vl; // number of values in A +#else + temp = off+2; // number of values in B +#endif + + for (k = temp/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + ptrbb += 2; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = 
VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); + ptrbb += 2; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); + ptrbb += 2; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); + ptrbb += 2; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); + ptrbb += 2; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); + ptrbb += 2; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); + ptrbb += 2; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); + ptrbb += 2; + } + + // K remainder + for (k = temp&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + + ptrbb += 2; + ptrba += vl; + } + va0 = VFMULVF_FLOAT(vres0, alpha, vl); + VSEV_FLOAT(C0, va0, vl); + + va1 = VFMULVF_FLOAT(vres1, alpha, vl); + VSEV_FLOAT(C1, va1, vl); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= vl; // number of values in A +#else + temp -= 2; // number of values in B +#endif + ptrba += temp*vl; + ptrbb += temp*2; +#endif + +#ifdef LEFT + off += vl; // number of values in A +#endif + + C0 += vl; + C1 += vl; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; +#endif + + bb += (bk<<1); + C += (ldc<<1); + } + + if (bn & 1) + { + C0 = C; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + ptrba = ba; + + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*vl; + ptrbb = bb + off*1; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vl); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+vl; // number of values in A +#else + temp = off+1; // number of values in B +#endif + + for (k = temp/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + ptrbb += 1; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + ptrbb += 1; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + ptrbb += 1; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + ptrbb += 1; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + ptrbb += 1; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + ptrbb += 1; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + ptrbb += 1; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + ptrbb += 1; + } + + // K remainder + for (k = temp&7; k > 0; k--) { + va0 = 
VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + + ptrbb += 1; + ptrba += vl; + } + va0 = VFMULVF_FLOAT(vres0, alpha, vl); + VSEV_FLOAT(C0, va0, vl); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= vl; // number of values in A +#else + temp -= 1; // number of values in B +#endif + ptrba += temp*vl; + ptrbb += temp*1; +#endif + +#ifdef LEFT + off += vl; // number of values in A +#endif + + C0 += vl; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; +#endif + + bb += (bk); + C += (ldc); + } + return 0; +} + diff --git a/kernel/riscv64/trsm_kernel_LN_rvv_v1.c b/kernel/riscv64/trsm_kernel_LN_rvv_v1.c new file mode 100644 index 000000000..11a0398ca --- /dev/null +++ b/kernel/riscv64/trsm_kernel_LN_rvv_v1.c @@ -0,0 +1,847 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define VSETVL_MAX vsetvlmax_e32m2()
+#define FLOAT_V_T vfloat32m2_t
+#define VLEV_FLOAT vle32_v_f32m2
+#define VLSEV_FLOAT vlse32_v_f32m2
+#define VLSEG2_FLOAT vlseg2e32_v_f32m2
+#define VSEV_FLOAT vse32_v_f32m2
+#define VSSEV_FLOAT vsse32_v_f32m2
+#define VSSEG2_FLOAT vsseg2e32_v_f32m2
+#define VFMACCVF_FLOAT vfmacc_vf_f32m2
+#define VFMULVF_FLOAT vfmul_vf_f32m2
+#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define VSETVL_MAX vsetvlmax_e64m2()
+#define FLOAT_V_T vfloat64m2_t
+#define VLEV_FLOAT vle64_v_f64m2
+#define VLSEV_FLOAT vlse64_v_f64m2
+#define VLSEG2_FLOAT vlseg2e64_v_f64m2
+#define VSEV_FLOAT vse64_v_f64m2
+#define VSSEV_FLOAT vsse64_v_f64m2
+#define VSSEG2_FLOAT vsseg2e64_v_f64m2
+#define VFMACCVF_FLOAT vfmacc_vf_f64m2
+#define VFMULVF_FLOAT vfmul_vf_f64m2
+#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2
+#endif
+
+
+static FLOAT dm1 = -1.;
+
+#ifdef CONJ
+#define GEMM_KERNEL GEMM_KERNEL_L
+#else
+#define GEMM_KERNEL GEMM_KERNEL_N
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 1
+#define GEMM_UNROLL_N_SHIFT 0
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 2
+#define GEMM_UNROLL_N_SHIFT 1
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 4
+#define GEMM_UNROLL_N_SHIFT 2
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 8
+#define GEMM_UNROLL_N_SHIFT 3
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 16
+#define GEMM_UNROLL_N_SHIFT 4
+#endif
+
+// Optimizes the implementation in ../arm64/trsm_kernel_LN_sve.c
+
+#ifndef COMPLEX
+
+#if GEMM_DEFAULT_UNROLL_N == 1
+
+static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+
+    FLOAT aa, bb;
+    FLOAT *pa, *pc;
+
+    int i, j, k;
+    //fprintf(stderr, "%s , %s, m = %4ld n = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, ldc); // Debug
+
+    size_t vl;
+    FLOAT_V_T va, vc;
+
+    a += (m - 1) * m;
+    b += (m - 1) * n;
+
+    for (i = m - 1; i >= 0; i--)
+    {
+        aa = *(a + i);
+        for (j = 0; j < n; j ++)
+        {
+            bb = *(c + i + j * ldc);
+            bb *= aa;
+            *b = bb;
+            *(c + i + j * ldc) = bb;
+            b ++;
+
+            pa = a;
+            pc = c + j * ldc;
+            for (k = i; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                vc = VLEV_FLOAT(pc, vl);
+                va = VLEV_FLOAT(pa, vl);
+                vc = VFNMSACVF_FLOAT(vc, bb, va, vl);
+                VSEV_FLOAT(pc, vc, vl);
+                pa += vl;
+                pc += vl;
+            }
+        }
+        a -= m;
+        b -= 2 * n;
+    }
+
+}
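+
+// What solve() does in this LN (lower, non-transposed) kernel, in brief:
+// rows are visited from m-1 down to 0; row i of C is scaled by the
+// pre-inverted diagonal entry of the packed triangular panel, the result
+// is stored back into both C and the packed buffer b, and the rows above
+// row i are then rank-1 updated with vfnmsac (c -= bb * a), strip-mined
+// over the remaining i rows.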
+#elif GEMM_DEFAULT_UNROLL_N == 2
+
+static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+
+    FLOAT aa, bb0, bb1;
+    FLOAT *pa, *pc, *pc0, *pc1;
+    FLOAT *pb0, *pb1;
+
+    int i, j, k;
+    //fprintf(stderr, "%s , %s, m = %4ld n = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, ldc); // Debug
+
+    size_t vl;
+    FLOAT_V_T va, vc0, vc1;
+
+    a += (m - 1) * m;
+    b += (m - 1) * n;
+
+    for (i = m - 1; i >= 0; i--)
+    {
+        aa = *(a + i);
+        pc = c + i;
+        for (j = 0; j < n/2; j ++)
+        {
+            pb0 = pc + j * ldc * 2;
+            pb1 = pb0 + ldc;
+            bb0 = (*pb0) * aa;
+            bb1 = (*pb1) * aa;
+            *b = bb0;
+            *(b+1) = bb1;
+            *pb0 = bb0;
+            *pb1 = bb1;
+
+            b += 2;
+
+            pc0 = c + j * ldc * 2;
+            pc1 = pc0 + ldc;
+            pa = a;
+            for (k = i; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                vc0 = VLEV_FLOAT(pc0, vl);
+                vc1 = VLEV_FLOAT(pc1, vl);
+                va = VLEV_FLOAT(pa, vl);
+                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
+                VSEV_FLOAT(pc0, vc0, vl);
+                VSEV_FLOAT(pc1, vc1, vl);
+
+                pa += vl;
+                pc0 += vl;
+                pc1 += vl;
+            }
+        }
+        pc += ldc * (n/2) * 2;
+        if (n & 1)
+        {
+            pb0 = pc;
+            bb0 = (*pb0) * aa;
+            *b = bb0;
+            *pb0 = bb0;
+            b += 1;
+
+            pc0 = pc - i;
+            pa = a;
+            for (k = i; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                vc0 = VLEV_FLOAT(pc0, vl);
+                va = VLEV_FLOAT(pa, vl);
+                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
+                VSEV_FLOAT(pc0, vc0, vl);
+
+                pa += vl;
+                pc0 += vl;
+            }
+        }
+
+        a -= m;
+        b -= 2 * n;
+    }
+
+}
+
+#elif GEMM_DEFAULT_UNROLL_N == 4
+
+static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+
+    FLOAT aa, bb0, bb1, bb2, bb3;
+    FLOAT *pa, *pc, *pc0, *pc1, *pc2, *pc3;
+    FLOAT *pb0, *pb1, *pb2, *pb3;
+
+    int i, j, k;
+
+    size_t vl;
+    FLOAT_V_T va, vc0, vc1, vc2, vc3;
+
+    a += (m - 1) * m;
+    b += (m - 1) * n;
+
+    for (i = m - 1; i >= 0; i--)
+    {
+        aa = *(a + i);
+        pc = c + i;
+        for (j = 0; j < n/4; j ++)
+        {
+            pb0 = pc + j * ldc * 4;
+            pb1 = pb0 + ldc;
+            pb2 = pb1 + ldc;
+            pb3 = pb2 + ldc;
+
+            bb0 = (*pb0) * aa;
+            bb1 = (*pb1) * aa;
+            bb2 = (*pb2) * aa;
+            bb3 = (*pb3) * aa;
+
+            *b = bb0;
+            *(b+1) = bb1;
+            *(b+2) = bb2;
+            *(b+3) = bb3;
+
+            *pb0 = bb0;
+            *pb1 = bb1;
+            *pb2 = bb2;
+            *pb3 = bb3;
+
+            b += 4;
+
+            pc0 = c + j * ldc * 4;
+            pc1 = pc0 + ldc;
+            pc2 = pc1 + ldc;
+            pc3 = pc2 + ldc;
+
+            pa = a;
+            for (k = i; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                vc0 = VLEV_FLOAT(pc0, vl);
+                vc1 = VLEV_FLOAT(pc1, vl);
+                vc2 = VLEV_FLOAT(pc2, vl);
+                vc3 = VLEV_FLOAT(pc3, vl);
+                va = VLEV_FLOAT(pa, vl);
+                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
+                vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl);
+                vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl);
+                VSEV_FLOAT(pc0, vc0, vl);
+                VSEV_FLOAT(pc1, vc1, vl);
+                VSEV_FLOAT(pc2, vc2, vl);
+                VSEV_FLOAT(pc3, vc3, vl);
+
+                pa += vl;
+                pc0 += vl;
+                pc1 += vl;
+                pc2 += vl;
+                pc3 += vl;
+            }
+        }
+        pc += ldc * (n/4) * 4;
+
+        if (n & 2)
+        {
+            pb0 = pc; // pc already points at row i of the first remaining column
+            pb1 = pb0 + ldc;
+
+            bb0 = (*pb0) * aa;
+            bb1 = (*pb1) * aa;
+
+            *b = bb0;
+            *(b+1) = bb1;
+
+            *pb0 = bb0;
+            *pb1 = bb1;
+
+            b += 2;
+
+            pc0 = pc - i;
+            pc1 = pc0 + ldc;
+
+            pa = a;
+            for (k = i; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                vc0 = VLEV_FLOAT(pc0, vl);
+                vc1 = VLEV_FLOAT(pc1, vl);
+                va = VLEV_FLOAT(pa, vl);
+                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
+                VSEV_FLOAT(pc0, vc0, vl);
+                VSEV_FLOAT(pc1, vc1, vl);
+
+                pa += vl;
+                pc0 += vl;
+                pc1 += vl;
+            }
+            pc += ldc * 2;
+        }
+
+        if (n & 1)
+        {
+            pb0 = pc;
+            bb0 = (*pb0) * aa;
+            *b = bb0;
+            *pb0 = bb0;
+            b += 1;
+
+            pc0 = pc - i;
+            pa = a;
+            for (k = i; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                vc0 = VLEV_FLOAT(pc0, vl);
+                va = VLEV_FLOAT(pa, vl);
+                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
+                VSEV_FLOAT(pc0, vc0, vl);
+
+                pa += vl;
+                pc0 += vl;
+            }
+        }
+
+        a -= m;
+        b -= 2 * n;
+    }
+
+}
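+
+// The wider variants are the same algorithm; they differ only in how many
+// columns of C are updated per pass, so each loaded slice of the panel (va)
+// is reused across several vfnmsac updates. The UNROLL_N == 8 variant below
+// is presumably the one selected for x280, given the v1x8 GEMM kernels.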
+#elif GEMM_DEFAULT_UNROLL_N == 8
+
+static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+
+    FLOAT aa, bb0, bb1, bb2, bb3, bb4, bb5, bb6, bb7;
+    FLOAT *pa, *pc, *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7;
+    FLOAT *pb0, *pb1, *pb2, *pb3, *pb4, *pb5, *pb6, *pb7;
+
+    int i, j, k;
+
+    size_t vl;
+    FLOAT_V_T va, vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7;
+
+    a += (m - 1) * m;
+    b += (m - 1) * n;
+
+    for (i = m - 1; i >= 0; i--)
+    {
+        aa = *(a + i);
+        pc = c + i;
+        for (j = 0; j < n/8; j ++)
+        {
+            pb0 = pc + j * ldc * 8;
+            pb1 = pb0 + ldc;
+            pb2 = pb1 + ldc;
+            pb3 = pb2 + ldc;
+            pb4 = pb3 + ldc;
+            pb5 = pb4 + ldc;
+            pb6 = pb5 + ldc;
+            pb7 = pb6 + ldc;
+
+            bb0 = (*pb0) * aa;
+            bb1 = (*pb1) * aa;
+            bb2 = (*pb2) * aa;
+            bb3 = (*pb3) * aa;
+            bb4 = (*pb4) * aa;
+            bb5 = (*pb5) * aa;
+            bb6 = (*pb6) * aa;
+            bb7 = (*pb7) * aa;
+
+            *b = bb0;
+            *(b+1) = bb1;
+            *(b+2) = bb2;
+            *(b+3) = bb3;
+            *(b+4) = bb4;
+            *(b+5) = bb5;
+            *(b+6) = bb6;
+            *(b+7) = bb7;
+
+            *pb0 = bb0;
+            *pb1 = bb1;
+            *pb2 = bb2;
+            *pb3 = bb3;
+            *pb4 = bb4;
+            *pb5 = bb5;
+            *pb6 = bb6;
+            *pb7 = bb7;
+
+            b += 8;
+
+            pc0 = c + j * ldc * 8;
+            pc1 = pc0 + ldc;
+            pc2 = pc1 + ldc;
+            pc3 = pc2 + ldc;
+            pc4 = pc3 + ldc;
+            pc5 = pc4 + ldc;
+            pc6 = pc5 + ldc;
+            pc7 = pc6 + ldc;
+
+            pa = a;
+            for (k = i; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                vc0 = VLEV_FLOAT(pc0, vl);
+                vc1 = VLEV_FLOAT(pc1, vl);
+                vc2 = VLEV_FLOAT(pc2, vl);
+                vc3 = VLEV_FLOAT(pc3, vl);
+                vc4 = VLEV_FLOAT(pc4, vl);
+                vc5 = VLEV_FLOAT(pc5, vl);
+                vc6 = VLEV_FLOAT(pc6, vl);
+                vc7 = VLEV_FLOAT(pc7, vl);
+                va = VLEV_FLOAT(pa, vl);
+                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
+                vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl);
+                vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl);
+                vc4 = VFNMSACVF_FLOAT(vc4, bb4, va, vl);
+                vc5 = VFNMSACVF_FLOAT(vc5, bb5, va, vl);
+                vc6 = VFNMSACVF_FLOAT(vc6, bb6, va, vl);
+                vc7 = VFNMSACVF_FLOAT(vc7, bb7, va, vl);
+                VSEV_FLOAT(pc0, vc0, vl);
+                VSEV_FLOAT(pc1, vc1, vl);
+                VSEV_FLOAT(pc2, vc2, vl);
+                VSEV_FLOAT(pc3, vc3, vl);
+                VSEV_FLOAT(pc4, vc4, vl);
+                VSEV_FLOAT(pc5, vc5, vl);
+                VSEV_FLOAT(pc6, vc6, vl);
+                VSEV_FLOAT(pc7, vc7, vl);
+
+                pa += vl;
+                pc0 += vl;
+                pc1 += vl;
+                pc2 += vl;
+                pc3 += vl;
+                pc4 += vl;
+                pc5 += vl;
+                pc6 += vl;
+                pc7 += vl;
+            }
+        }
+        pc += ldc * (n/8) * 8;
+
+        if (n & 4)
+        {
+            pb0 = pc; // pc already points at row i of the first remaining column
+            pb1 = pb0 + ldc;
+            pb2 = pb1 + ldc;
+            pb3 = pb2 + ldc;
+
+            bb0 = (*pb0) * aa;
+            bb1 = (*pb1) * aa;
+            bb2 = (*pb2) * aa;
+            bb3 = (*pb3) * aa;
+
+            *b = bb0;
+            *(b+1) = bb1;
+            *(b+2) = bb2;
+            *(b+3) = bb3;
+
+            *pb0 = bb0;
+            *pb1 = bb1;
+            *pb2 = bb2;
+            *pb3 = bb3;
+
+            b += 4;
+
+            pc0 = pc - i;
+            pc1 = pc0 + ldc;
+            pc2 = pc1 + ldc;
+            pc3 = pc2 + ldc;
+
+            pa = a;
+            for (k = i; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                vc0 = VLEV_FLOAT(pc0, vl);
+                vc1 = VLEV_FLOAT(pc1, vl);
+                vc2 = VLEV_FLOAT(pc2, vl);
+                vc3 = VLEV_FLOAT(pc3, vl);
+                va = VLEV_FLOAT(pa, vl);
+                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
+                vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl);
+                vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl);
+                VSEV_FLOAT(pc0, vc0, vl);
+                VSEV_FLOAT(pc1, vc1, vl);
+                VSEV_FLOAT(pc2, vc2, vl);
+                VSEV_FLOAT(pc3, vc3, vl);
+
+                pa += vl;
+                pc0 += vl;
+                pc1 += vl;
+                pc2 += vl;
+                pc3 += vl;
+            }
+            pc += ldc * 4;
+        }
+
+        if (n & 2)
+        {
+            pb0 = pc;
+            pb1 = pb0 + ldc;
+
+            bb0 = (*pb0) * aa;
+            bb1 = (*pb1) * aa;
+
+            *b = bb0;
+            *(b+1) = bb1;
+
+            *pb0 = bb0;
+            *pb1 = bb1;
+
+            b += 2;
+
+            pc0 = pc - i;
+            pc1 = pc0 + ldc;
+
+            pa = a;
+            for (k = i; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                vc0 = VLEV_FLOAT(pc0, vl);
+                vc1 = VLEV_FLOAT(pc1, vl);
+                va = VLEV_FLOAT(pa, vl);
+                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
+                VSEV_FLOAT(pc0, vc0, vl);
+                VSEV_FLOAT(pc1, vc1, vl);
+
+                pa += vl;
+                pc0 += vl;
+                pc1 += vl;
+            }
+            pc += ldc * 2;
+        }
+
+        if (n & 1)
+        {
+            pb0 = pc;
+            bb0 = (*pb0) * aa;
+            *b = bb0;
+            *pb0 = bb0;
+            b += 1;
+
+            pc0 = pc - i;
+            pa = a;
+            for (k = i; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                vc0 = VLEV_FLOAT(pc0, vl);
+                va = VLEV_FLOAT(pa, vl);
+                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
+                VSEV_FLOAT(pc0, vc0, vl);
+
+                pa += vl;
+                pc0 += vl;
+            }
+        }
+
+        a -= m;
+        b -= 2 * n;
+    }
+
+}
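+
+// Scalar fallback for GEMM_DEFAULT_UNROLL_N values with no vectorized
+// variant above; it is the reference loop that the versions above
+// strip-mine with vsetvl.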
+#else
+
+static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+
+    FLOAT aa, bb;
+
+    int i, j, k;
+
+    a += (m - 1) * m;
+    b += (m - 1) * n;
+
+    for (i = m - 1; i >= 0; i--) {
+
+        aa = *(a + i);
+
+        for (j = 0; j < n; j ++) {
+            bb = *(c + i + j * ldc);
+            bb *= aa;
+            *b = bb;
+            *(c + i + j * ldc) = bb;
+            b ++;
+
+            for (k = 0; k < i; k ++){
+                *(c + k + j * ldc) -= bb * *(a + k);
+            }
+
+        }
+        a -= m;
+        b -= 2 * n;
+    }
+
+}
+
+#endif
+
+#else
+
+static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+
+    FLOAT aa1, aa2;
+    FLOAT bb1, bb2;
+    FLOAT cc1, cc2;
+
+    int i, j, k;
+
+    ldc *= 2;
+    a += (m - 1) * m * 2;
+    b += (m - 1) * n * 2;
+
+    for (i = m - 1; i >= 0; i--) {
+
+        aa1 = *(a + i * 2 + 0);
+        aa2 = *(a + i * 2 + 1);
+
+        for (j = 0; j < n; j ++) {
+            bb1 = *(c + i * 2 + 0 + j * ldc);
+            bb2 = *(c + i * 2 + 1 + j * ldc);
+
+#ifndef CONJ
+            cc1 = aa1 * bb1 - aa2 * bb2;
+            cc2 = aa1 * bb2 + aa2 * bb1;
+#else
+            cc1 = aa1 * bb1 + aa2 * bb2;
+            cc2 = aa1 * bb2 - aa2 * bb1;
+#endif
+
+            *(b + 0) = cc1;
+            *(b + 1) = cc2;
+            *(c + i * 2 + 0 + j * ldc) = cc1;
+            *(c + i * 2 + 1 + j * ldc) = cc2;
+            b += 2;
+
+            for (k = 0; k < i; k ++){
+#ifndef CONJ
+                *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1);
+                *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0);
+#else
+                *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1);
+                *(c + k * 2 + 1 + j * ldc) -= - cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0);
+#endif
+            }
+
+        }
+        a -= m * 2;
+        b -= 4 * n;
+    }
+
+}
+
+#endif
+
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
+#ifdef COMPLEX
+          FLOAT dummy2,
+#endif
+          FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){
+
+    BLASLONG i, j;
+    FLOAT *aa, *cc;
+    BLASLONG kk;
+
+    size_t vl = VSETVL_MAX;
+
+    //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug
+
+    j = (n >> GEMM_UNROLL_N_SHIFT);
+
+    while (j > 0) {
+
+        kk = m + offset;
+
+        i = m % vl;
+        if (i) {
+            aa = a + (m - i) * k * COMPSIZE;
+            cc = c + (m - i) * COMPSIZE;
+
+            if (k - kk > 0) {
+                GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1,
+#ifdef COMPLEX
+                            ZERO,
+#endif
+                            aa + i * kk * COMPSIZE,
+                            b + GEMM_UNROLL_N * kk * COMPSIZE,
+                            cc,
+                            ldc);
+            }
+
+            solve(i, GEMM_UNROLL_N,
+                  aa + (kk - i) * i * COMPSIZE,
+                  b + (kk - i) * GEMM_UNROLL_N * COMPSIZE,
+                  cc, ldc);
+
+            kk -= i;
+
+        }
+
+        int mod = i;
+        i = vl;
+        if (i <= m) {
+            aa = a + (m - mod - vl) * k * COMPSIZE;
+            cc = c + (m - mod - vl) * COMPSIZE;
+
+            do {
+                if (k - kk > 0) {
+                    GEMM_KERNEL(vl, GEMM_UNROLL_N, k - kk, dm1,
+#ifdef COMPLEX
+                                ZERO,
+#endif
+                                aa + vl * kk * COMPSIZE,
+                                b + GEMM_UNROLL_N * kk * COMPSIZE,
+                                cc,
+                                ldc);
+                }
+
+                solve(vl, GEMM_UNROLL_N,
+                      aa + (kk - vl) * vl * COMPSIZE,
+                      b + (kk - vl) * GEMM_UNROLL_N * COMPSIZE,
+                      cc, ldc);
+
+                aa -= vl * k * COMPSIZE;
+                cc -= vl * COMPSIZE;
+                kk -= vl;
+
+                i += vl;
+            } while (i <= m);
+        }
+
+
+        b += GEMM_UNROLL_N * k * COMPSIZE;
+        c += GEMM_UNROLL_N * ldc * COMPSIZE;
+        j --;
+    }
+
+    if (n & (GEMM_UNROLL_N - 1)) {
+
+        j = (GEMM_UNROLL_N >> 1);
+        while (j > 0) {
+            if (n & j) {
+
+                kk = m + offset;
+
+                i = m % vl;
+                if (i) {
+                    aa = a + (m - i) * k * COMPSIZE;
+                    cc = c + (m - i) * COMPSIZE;
+
+                    if (k - kk > 0) {
+                        GEMM_KERNEL(i, j, k - kk, dm1,
+#ifdef COMPLEX
+                                    ZERO,
+#endif
+                                    aa + i * kk * COMPSIZE,
+                                    b + j * kk * COMPSIZE,
+                                    cc, ldc);
+                    }
+
+                    solve(i, j,
+                          aa + (kk - i) * i * COMPSIZE,
+                          b + (kk - i) * j * COMPSIZE,
+                          cc,
ldc); + + kk -= i; + + } + + int mod = i; + i = vl; + if (i <= m) { + aa = a + (m - mod - vl) * k * COMPSIZE; + cc = c + (m - mod - vl) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(vl, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + vl * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(vl, j, + aa + (kk - vl) * vl * COMPSIZE, + b + (kk - vl) * j * COMPSIZE, + cc, ldc); + + aa -= vl * k * COMPSIZE; + cc -= vl * COMPSIZE; + kk -= vl; + + i += vl; + } while (i <= m); + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/riscv64/trsm_kernel_LT_rvv_v1.c b/kernel/riscv64/trsm_kernel_LT_rvv_v1.c new file mode 100644 index 000000000..0380bd1bb --- /dev/null +++ b/kernel/riscv64/trsm_kernel_LT_rvv_v1.c @@ -0,0 +1,840 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define VSETVL_MAX vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VSSEV_FLOAT vsse32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VFMACCVF_FLOAT vfmacc_vf_f32m2 +#define VFMULVF_FLOAT vfmul_vf_f32m2 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define VSETVL_MAX vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VSSEV_FLOAT vsse64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VFMACCVF_FLOAT vfmacc_vf_f64m2 +#define VFMULVF_FLOAT vfmul_vf_f64m2 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 +#endif + + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +// Optimizes the implementation in ../arm64/trsm_kernel_LT_sve.c + +#ifndef COMPLEX +#if GEMM_DEFAULT_UNROLL_N == 1 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) +{ + FLOAT aa, bb; + FLOAT *pa, *pc; + + int i, j, k; + size_t vl; + FLOAT_V_T va, vc; + for (i = 0; i < m; i++) + { + aa = *(a + i); + for (j = 0; j < n; j ++) + { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b++; + pa = a + i + 1; + pc = c + j * ldc + i + 1; + for (k = (m - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc = VLEV_FLOAT(pc, vl); + va = VLEV_FLOAT(pa, vl); + vc = VFNMSACVF_FLOAT(vc, bb, va, vl); + VSEV_FLOAT(pc, vc, vl); + pa += vl; + pc += vl; + } + } + a += m; + } +} +#elif GEMM_DEFAULT_UNROLL_N == 2 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) +{ + + FLOAT aa, bb0, bb1; + FLOAT *pa, *pc, *pc0, *pc1; + FLOAT *pb0, *pb1; + + int i, j, k; + size_t vl; + FLOAT_V_T va, vc0, vc1; + for (i = 0; i < m; i++) + { + aa = *(a + i); + pc = c + i; + for (j = 0; j < n/2; j ++) + { + pb0 = pc + j * ldc * 2; + pb1 = pb0 + ldc; + bb0 = (*pb0) * aa; + bb1 = (*pb1) * aa; + *b = bb0; + *(b+1) = bb1; + *pb0 = bb0; + *pb1 = bb1; + b += 2; + pa = a + i + 1; + pc0 = pb0 + 1; + pc1 = pc0 + ldc; + for (k = (m - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + vc1 = VLEV_FLOAT(pc1, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + VSEV_FLOAT(pc1, vc1, vl); + pa += vl; + pc0 += vl; + pc1 += vl; + } + } + pc += ldc * (n/2) * 2; + if (n & 1) + { + pb0 = pc; + bb0 = *(pb0); + bb0 *= aa; + *b = bb0; + *(c + i) = bb0; + b++; + pa = a + i + 1; + pc0 = pb0 + 1; + for (k = (m - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + pa += vl; + pc0 += vl; + } + } + + a += 
m; + } +} +#elif GEMM_DEFAULT_UNROLL_N == 4 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) +{ + + FLOAT aa, bb0, bb1, bb2, bb3; + FLOAT *pa, *pc; + FLOAT *pc0, *pc1, *pc2, *pc3; + FLOAT *pb0, *pb1, *pb2, *pb3; + + int i, j, k; + size_t vl; + FLOAT_V_T va; + FLOAT_V_T vc0, vc1, vc2, vc3; + for (i = 0; i < m; i++) + { + aa = *(a + i); + pc = c + i; + for (j = 0; j < n/4; j ++) + { + pb0 = pc; + pb1 = pb0 + ldc; + pb2 = pb1 + ldc; + pb3 = pb2 + ldc; + + bb0 = (*pb0) * aa; + bb1 = (*pb1) * aa; + bb2 = (*pb2) * aa; + bb3 = (*pb3) * aa; + + *b = bb0; + *(b+1) = bb1; + *(b+2) = bb2; + *(b+3) = bb3; + + *pb0 = bb0; + *pb1 = bb1; + *pb2 = bb2; + *pb3 = bb3; + b += 4; + + pa = a + i + 1; + pc0 = pb0 + 1; + pc1 = pc0 + ldc; + pc2 = pc1 + ldc; + pc3 = pc2 + ldc; + + for (k = (m - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + vc1 = VLEV_FLOAT(pc1, vl); + vc2 = VLEV_FLOAT(pc2, vl); + vc3 = VLEV_FLOAT(pc3, vl); + + va = VLEV_FLOAT(pa, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); + vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl); + vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl); + + VSEV_FLOAT(pc0, vc0, vl); + VSEV_FLOAT(pc1, vc1, vl); + VSEV_FLOAT(pc2, vc2, vl); + VSEV_FLOAT(pc3, vc3, vl); + + pa += vl; + pc0 += vl; + pc1 += vl; + pc2 += vl; + pc3 += vl; + } + } + pc += ldc * (n/4) * 4; + + if (n & 2) + { + pb0 = pc; + pb1 = pb0 + ldc; + bb0 = (*pb0) * aa; + bb1 = (*pb1) * aa; + *b = bb0; + *(b+1) = bb1; + *pb0 = bb0; + *pb1 = bb1; + b += 2; + pa = a + i + 1; + pc0 = pb0 + 1; + pc1 = pc0 + ldc; + for (k = (m - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + vc1 = VLEV_FLOAT(pc1, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + VSEV_FLOAT(pc1, vc1, vl); + pa += vl; + pc0 += vl; + pc1 += vl; + } + pc += ldc * 2; + } + + if (n & 1) + { + pb0 = pc; + bb0 = *(pb0); + bb0 *= aa; + *b = bb0; + *(c + i) = bb0; + b++; + pa = a + i + 1; + pc0 = pb0 + 1; + for (k = (m - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + pa += vl; + pc0 += vl; + } + } + + a += m; + } +} +#elif GEMM_DEFAULT_UNROLL_N == 8 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) +{ + + FLOAT aa, bb0, bb1, bb2, bb3, bb4, bb5, bb6, bb7; + FLOAT *pa, *pc; + FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7; + FLOAT *pb0, *pb1, *pb2, *pb3, *pb4, *pb5, *pb6, *pb7; + + int i, j, k; + size_t vl; + FLOAT_V_T va; + FLOAT_V_T vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7; + for (i = 0; i < m; i++) + { + aa = *(a + i); + pc = c + i; + for (j = 0; j < n/8; j ++) + { + pb0 = pc + j * ldc * 8; + pb1 = pb0 + ldc; + pb2 = pb1 + ldc; + pb3 = pb2 + ldc; + pb4 = pb3 + ldc; + pb5 = pb4 + ldc; + pb6 = pb5 + ldc; + pb7 = pb6 + ldc; + + bb0 = (*pb0) * aa; + bb1 = (*pb1) * aa; + bb2 = (*pb2) * aa; + bb3 = (*pb3) * aa; + bb4 = (*pb4) * aa; + bb5 = (*pb5) * aa; + bb6 = (*pb6) * aa; + bb7 = (*pb7) * aa; + + *b = bb0; + *(b+1) = bb1; + *(b+2) = bb2; + *(b+3) = bb3; + *(b+4) = bb4; + *(b+5) = bb5; + *(b+6) = bb6; + *(b+7) = bb7; + + *pb0 = bb0; + *pb1 = bb1; + *pb2 = bb2; + *pb3 = bb3; + *pb4 = bb4; + *pb5 = bb5; + *pb6 = bb6; + *pb7 = bb7; + b += 8; + + pa = a + i + 1; + pc0 = pb0 + 1; + pc1 = pc0 + ldc; + pc2 = pc1 + ldc; + pc3 = pc2 + ldc; + pc4 = pc3 + ldc; + pc5 = 
pc4 + ldc; + pc6 = pc5 + ldc; + pc7 = pc6 + ldc; + + for (k = (m - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + vc1 = VLEV_FLOAT(pc1, vl); + vc2 = VLEV_FLOAT(pc2, vl); + vc3 = VLEV_FLOAT(pc3, vl); + vc4 = VLEV_FLOAT(pc4, vl); + vc5 = VLEV_FLOAT(pc5, vl); + vc6 = VLEV_FLOAT(pc6, vl); + vc7 = VLEV_FLOAT(pc7, vl); + + va = VLEV_FLOAT(pa, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); + vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl); + vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl); + vc4 = VFNMSACVF_FLOAT(vc4, bb4, va, vl); + vc5 = VFNMSACVF_FLOAT(vc5, bb5, va, vl); + vc6 = VFNMSACVF_FLOAT(vc6, bb6, va, vl); + vc7 = VFNMSACVF_FLOAT(vc7, bb7, va, vl); + + VSEV_FLOAT(pc0, vc0, vl); + VSEV_FLOAT(pc1, vc1, vl); + VSEV_FLOAT(pc2, vc2, vl); + VSEV_FLOAT(pc3, vc3, vl); + VSEV_FLOAT(pc4, vc4, vl); + VSEV_FLOAT(pc5, vc5, vl); + VSEV_FLOAT(pc6, vc6, vl); + VSEV_FLOAT(pc7, vc7, vl); + + pa += vl; + pc0 += vl; + pc1 += vl; + pc2 += vl; + pc3 += vl; + pc4 += vl; + pc5 += vl; + pc6 += vl; + pc7 += vl; + } + } + pc += ldc * (n/8) * 8; + + if (n & 4) + { + pb0 = pc; + pb1 = pb0 + ldc; + pb2 = pb1 + ldc; + pb3 = pb2 + ldc; + + bb0 = (*pb0) * aa; + bb1 = (*pb1) * aa; + bb2 = (*pb2) * aa; + bb3 = (*pb3) * aa; + + *b = bb0; + *(b+1) = bb1; + *(b+2) = bb2; + *(b+3) = bb3; + + *pb0 = bb0; + *pb1 = bb1; + *pb2 = bb2; + *pb3 = bb3; + b += 4; + + pa = a + i + 1; + pc0 = pb0 + 1; + pc1 = pc0 + ldc; + pc2 = pc1 + ldc; + pc3 = pc2 + ldc; + + for (k = (m - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + vc1 = VLEV_FLOAT(pc1, vl); + vc2 = VLEV_FLOAT(pc2, vl); + vc3 = VLEV_FLOAT(pc3, vl); + + va = VLEV_FLOAT(pa, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); + vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl); + vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl); + + VSEV_FLOAT(pc0, vc0, vl); + VSEV_FLOAT(pc1, vc1, vl); + VSEV_FLOAT(pc2, vc2, vl); + VSEV_FLOAT(pc3, vc3, vl); + + pa += vl; + pc0 += vl; + pc1 += vl; + pc2 += vl; + pc3 += vl; + } + pc += ldc * 4; + } + + if (n & 2) + { + pb0 = pc; + pb1 = pb0 + ldc; + bb0 = (*pb0) * aa; + bb1 = (*pb1) * aa; + *b = bb0; + *(b+1) = bb1; + *pb0 = bb0; + *pb1 = bb1; + b += 2; + pa = a + i + 1; + pc0 = pb0 + 1; + pc1 = pc0 + ldc; + for (k = (m - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + vc1 = VLEV_FLOAT(pc1, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + VSEV_FLOAT(pc1, vc1, vl); + pa += vl; + pc0 += vl; + pc1 += vl; + } + pc += ldc * 2; + } + + if (n & 1) + { + pb0 = pc; + bb0 = *(pb0); + bb0 *= aa; + *b = bb0; + *(c + i) = bb0; + b++; + pa = a + i + 1; + pc0 = pb0 + 1; + for (k = (m - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + pa += vl; + pc0 += vl; + } + } + + a += m; + } +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < m; i++) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + for (k = i + 1; k < m; k ++){ + *(c + k + j * ldc) -= bb * *(a + k); + } + + } + a += m; + } +} + +#endif + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT 
aa1, aa2;
+    FLOAT bb1, bb2;
+    FLOAT cc1, cc2;
+
+    int i, j, k;
+
+    ldc *= 2;
+
+    for (i = 0; i < m; i++) {
+
+        aa1 = *(a + i * 2 + 0);
+        aa2 = *(a + i * 2 + 1);
+
+        for (j = 0; j < n; j ++) {
+            bb1 = *(c + i * 2 + 0 + j * ldc);
+            bb2 = *(c + i * 2 + 1 + j * ldc);
+
+#ifndef CONJ
+            cc1 = aa1 * bb1 - aa2 * bb2;
+            cc2 = aa1 * bb2 + aa2 * bb1;
+#else
+            cc1 = aa1 * bb1 + aa2 * bb2;
+            cc2 = aa1 * bb2 - aa2 * bb1;
+#endif
+
+            *(b + 0) = cc1;
+            *(b + 1) = cc2;
+            *(c + i * 2 + 0 + j * ldc) = cc1;
+            *(c + i * 2 + 1 + j * ldc) = cc2;
+            b += 2;
+
+            for (k = i + 1; k < m; k ++){
+#ifndef CONJ
+                *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1);
+                *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0);
+#else
+                *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1);
+                *(c + k * 2 + 1 + j * ldc) -= -cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0);
+#endif
+            }
+
+        }
+        a += m * 2;
+    }
+}
+
+
+// Same forward substitution as solve() above, but with the inner update of
+// the trailing rows of C vectorized (vl complex elements per iteration).
+static inline void solve_N1(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+
+    FLOAT aa1, aa2;
+    FLOAT bb1, bb2;
+    FLOAT cc1, cc2;
+    FLOAT *pa, *pc;
+
+    int i, j, k;
+
+    size_t vl;
+    FLOAT_V_T va0, va1, vc0, vc1;
+
+    ldc *= 2;
+
+    for (i = 0; i < m; i++) {
+
+        aa1 = *(a + i * 2 + 0);
+        aa2 = *(a + i * 2 + 1);
+
+        for (j = 0; j < n; j ++) {
+            bb1 = *(c + i * 2 + 0 + j * ldc);
+            bb2 = *(c + i * 2 + 1 + j * ldc);
+
+#ifndef CONJ
+            cc1 = aa1 * bb1 - aa2 * bb2;
+            cc2 = aa1 * bb2 + aa2 * bb1;
+#else
+            cc1 = aa1 * bb1 + aa2 * bb2;
+            cc2 = aa1 * bb2 - aa2 * bb1;
+#endif
+
+            *(b + 0) = cc1;
+            *(b + 1) = cc2;
+            *(c + i * 2 + 0 + j * ldc) = cc1;
+            *(c + i * 2 + 1 + j * ldc) = cc2;
+            b += 2;
+
+            pa = a + (i + 1) * 2;
+            pc = c + j * ldc + (i + 1) * 2;
+            for (k = (m - i - 1); k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                VLSEG2_FLOAT(&va0, &va1, pa, vl);
+                VLSEG2_FLOAT(&vc0, &vc1, pc, vl);
+#ifndef CONJ
+                vc0 = VFNMSACVF_FLOAT(vc0, cc1, va0, vl);
+                vc0 = VFMACCVF_FLOAT(vc0, cc2, va1, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, cc1, va1, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, cc2, va0, vl);
+#else
+                vc0 = VFNMSACVF_FLOAT(vc0, cc1, va0, vl);
+                vc0 = VFNMSACVF_FLOAT(vc0, cc2, va1, vl);
+                vc1 = VFMACCVF_FLOAT(vc1, cc1, va1, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, cc2, va0, vl);
+#endif
+                VSSEG2_FLOAT(pc, vc0, vc1, vl);
+                pa += vl * 2;
+                pc += vl * 2;
+            }
+        }
+        a += m * 2;
+    }
+}
+
+#endif
+
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
+#ifdef COMPLEX
+           FLOAT dummy2,
+#endif
+           FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){
+
+    FLOAT *aa, *cc;
+    BLASLONG kk;
+    BLASLONG i, j;
+
+    size_t vl = VSETVL_MAX;
+
+    //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug
+
+    j = (n >> GEMM_UNROLL_N_SHIFT);
+
+    while (j > 0) {
+
+        kk = offset;
+        aa = a;
+        cc = c;
+
+        i = vl;
+
+        while (i <= m) {
+
+            if (kk > 0) {
+                GEMM_KERNEL(vl, GEMM_UNROLL_N, kk, dm1,
+#ifdef COMPLEX
+                            ZERO,
+#endif
+                            aa, b, cc, ldc);
+            }
+
+            solve(vl, GEMM_UNROLL_N,
+                  aa + kk * vl * COMPSIZE,
+                  b + kk * GEMM_UNROLL_N * COMPSIZE,
+                  cc, ldc);
+
+            aa += vl * k * COMPSIZE;
+            cc += vl * COMPSIZE;
+            kk += vl;
+            i += vl;
+        }
+
+        i = m % vl;
+        if (i) {
+            if (kk > 0) {
+                GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1,
+#ifdef COMPLEX
+                            ZERO,
+#endif
+                            aa, b, cc, ldc);
+            }
+            solve(i, GEMM_UNROLL_N,
+                  aa + kk * i * COMPSIZE,
+                  b + kk * GEMM_UNROLL_N * COMPSIZE,
+                  cc, ldc);
+
+            aa += i * k * COMPSIZE;
+            cc += i * COMPSIZE;
+            kk += i;
+
+        }
+
+        b += GEMM_UNROLL_N * k * COMPSIZE;
+        c += GEMM_UNROLL_N * ldc * COMPSIZE;
+        j --;
+    }
+
+    if (n & (GEMM_UNROLL_N - 
1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = offset; + aa = a; + cc = c; + + i = vl; + + while (i <= m) { + if (kk > 0) { + GEMM_KERNEL(vl, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(vl, j, + aa + kk * vl * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += vl * k * COMPSIZE; + cc += vl * COMPSIZE; + kk += vl; + i += vl; + } + + i = m % vl; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/riscv64/trsm_kernel_RN_rvv_v1.c b/kernel/riscv64/trsm_kernel_RN_rvv_v1.c new file mode 100644 index 000000000..41368be60 --- /dev/null +++ b/kernel/riscv64/trsm_kernel_RN_rvv_v1.c @@ -0,0 +1,792 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define VSETVL_MAX vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VSSEV_FLOAT vsse32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VFMACCVF_FLOAT vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define VSETVL_MAX vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VSSEV_FLOAT vsse64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VFMACCVF_FLOAT vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 +#endif + + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +// Optimizes the implementation in ../arm64/trsm_kernel_RN_sve.c + +#ifndef COMPLEX + +#if GEMM_DEFAULT_UNROLL_N == 1 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + FLOAT *pb, *pc; + BLASLONG stride_ldc = sizeof(FLOAT) * ldc; + int i, j, k; + size_t vl; + FLOAT_V_T vb, vc; + + for (i = 0; i < n; i++) + { + bb = *(b + i); + + for (j = 0; j < m; j ++) + { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + pb = b + i + 1; + pc = c + j + (i + 1) *ldc; + for (k = (n - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc = VLSEV_FLOAT(pc, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + vc = VFNMSACVF_FLOAT(vc, aa, vb, vl); + VSSEV_FLOAT(pc, stride_ldc, vc, vl); + pb += vl; + pc ++; + } + } + b += n; + } +} + +#elif GEMM_DEFAULT_UNROLL_N == 2 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa0, aa1, bb; + FLOAT *pb, *pc; + FLOAT *pa0, *pa1, *pc0, *pc1; + BLASLONG stride_ldc = sizeof(FLOAT) * ldc; + int i, j, k; + size_t vl; + FLOAT_V_T vb, vc0, vc1; + + for (i = 0; i < n; i++) + { + bb = *(b + i); + pc = c + i * ldc; + for (j = 0; j < m/2; j ++) + { + pa0 = pc + j * 2; + pa1 = pc + j * 2 + 1; + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + + *pa0 = aa0; + *pa1 = aa1; + *a = aa0; + *(a + 1)= aa1; + a += 2; + + pb = b + i + 1; + pc0 = pa0 + ldc; + pc1 = pa1 + ldc; + for (k = (n - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + pb += vl; + pc0++; + pc1++; + } + } + pc += (m/2)*2; + if (m & 1) + { + pa0 = pc; + aa0 = *pa0 * bb; + + *pa0 = aa0; + *a = aa0; + a += 1; + + pb = b + i + 1; + pc0 = pa0 + ldc; + for (k = (n - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + vc0 = 
VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + pb += vl; + pc0++; + } + } + b += n; + } +} + +#elif GEMM_DEFAULT_UNROLL_N == 4 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT bb; + FLOAT aa0, aa1, aa2, aa3; + FLOAT *pb, *pc; + FLOAT *pa0, *pa1, *pa2, *pa3; + FLOAT *pc0, *pc1, *pc2, *pc3; + BLASLONG stride_ldc = sizeof(FLOAT) * ldc; + int i, j, k; + size_t vl; + FLOAT_V_T vb, vc0, vc1, vc2, vc3; + + for (i = 0; i < n; i++) + { + bb = *(b + i); + pc = c + i * ldc; + for (j = 0; j < m/4; j ++) + { + pa0 = pc + j * 4; + pa1 = pa0 + 1; + pa2 = pa1 + 1; + pa3 = pa2 + 1; + + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + aa2 = *pa2 * bb; + aa3 = *pa3 * bb; + + *pa0 = aa0; + *pa1 = aa1; + *pa2 = aa2; + *pa3 = aa3; + + *a = aa0; + *(a + 1)= aa1; + *(a + 2)= aa2; + *(a + 3)= aa3; + + a += 4; + + pb = b + i + 1; + pc0 = pa0 + ldc; + pc1 = pa1 + ldc; + pc2 = pa2 + ldc; + pc3 = pa3 + ldc; + for (k = (n - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); + vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); + vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); + + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); + VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); + + pb += vl; + pc0++; + pc1++; + pc2++; + pc3++; + } + } + pc += (m/4)*4; + + if (m & 2) + { + pa0 = pc; + pa1 = pa0 + 1; + + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + + *pa0 = aa0; + *pa1 = aa1; + + *a = aa0; + *(a + 1)= aa1; + + a += 2; + + pb = b + i + 1; + pc0 = pa0 + ldc; + pc1 = pa1 + ldc; + for (k = (n - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + + pb += vl; + pc0++; + pc1++; + } + pc += 2; + } + + if (m & 1) + { + pa0 = pc; + aa0 = *pa0 * bb; + + *pa0 = aa0; + *a = aa0; + a += 1; + + pb = b + i + 1; + pc0 = pa0 + ldc; + for (k = (n - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + pb += vl; + pc0++; + } + } + b += n; + } +} + +#elif GEMM_DEFAULT_UNROLL_N == 8 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT bb; + FLOAT aa0, aa1, aa2, aa3, aa4, aa5, aa6, aa7; + FLOAT *pb, *pc; + FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7; + FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7; + BLASLONG stride_ldc = sizeof(FLOAT) * ldc; + int i, j, k; + size_t vl; + FLOAT_V_T vb, vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7; + + for (i = 0; i < n; i++) + { + bb = *(b + i); + pc = c + i * ldc; + for (j = 0; j < m/8; j ++) + { + pa0 = pc + j * 8; + pa1 = pa0 + 1; + pa2 = pa1 + 1; + pa3 = pa2 + 1; + pa4 = pa3 + 1; + pa5 = pa4 + 1; + pa6 = pa5 + 1; + pa7 = pa6 + 1; + + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + aa2 = *pa2 * bb; + aa3 = *pa3 * bb; + aa4 = *pa4 * bb; + aa5 = *pa5 * bb; + aa6 = *pa6 * bb; + aa7 = *pa7 * bb; + + *pa0 = aa0; + *pa1 = aa1; + *pa2 = aa2; + *pa3 = aa3; 
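+            /* write back the second half of the scaled 8-row block; the same
+               values are stored into the packed panel a[] just below */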
+ *pa4 = aa4; + *pa5 = aa5; + *pa6 = aa6; + *pa7 = aa7; + + *a = aa0; + *(a + 1)= aa1; + *(a + 2)= aa2; + *(a + 3)= aa3; + *(a + 4)= aa4; + *(a + 5)= aa5; + *(a + 6)= aa6; + *(a + 7)= aa7; + + a += 8; + + pb = b + i + 1; + pc0 = pa0 + ldc; + pc1 = pa1 + ldc; + pc2 = pa2 + ldc; + pc3 = pa3 + ldc; + pc4 = pa4 + ldc; + pc5 = pa5 + ldc; + pc6 = pa6 + ldc; + pc7 = pa7 + ldc; + for (k = (n - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); + vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); + vc4 = VLSEV_FLOAT(pc4, stride_ldc, vl); + vc5 = VLSEV_FLOAT(pc5, stride_ldc, vl); + vc6 = VLSEV_FLOAT(pc6, stride_ldc, vl); + vc7 = VLSEV_FLOAT(pc7, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); + vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); + vc4 = VFNMSACVF_FLOAT(vc4, aa4, vb, vl); + vc5 = VFNMSACVF_FLOAT(vc5, aa5, vb, vl); + vc6 = VFNMSACVF_FLOAT(vc6, aa6, vb, vl); + vc7 = VFNMSACVF_FLOAT(vc7, aa7, vb, vl); + + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); + VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); + VSSEV_FLOAT(pc4, stride_ldc, vc4, vl); + VSSEV_FLOAT(pc5, stride_ldc, vc5, vl); + VSSEV_FLOAT(pc6, stride_ldc, vc6, vl); + VSSEV_FLOAT(pc7, stride_ldc, vc7, vl); + + pb += vl; + pc0++; + pc1++; + pc2++; + pc3++; + pc4++; + pc5++; + pc6++; + pc7++; + } + } + pc += (m/8)*8; + + if (m & 4) + { + pa0 = pc; + pa1 = pa0 + 1; + pa2 = pa1 + 1; + pa3 = pa2 + 1; + + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + aa2 = *pa2 * bb; + aa3 = *pa3 * bb; + + *pa0 = aa0; + *pa1 = aa1; + *pa2 = aa2; + *pa3 = aa3; + + *a = aa0; + *(a + 1)= aa1; + *(a + 2)= aa2; + *(a + 3)= aa3; + + a += 4; + + pb = b + i + 1; + pc0 = pa0 + ldc; + pc1 = pa1 + ldc; + pc2 = pa2 + ldc; + pc3 = pa3 + ldc; + for (k = (n - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); + vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); + vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); + + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); + VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); + + pb += vl; + pc0++; + pc1++; + pc2++; + pc3++; + } + pc += 4; + } + + if (m & 2) + { + pa0 = pc; + pa1 = pa0 + 1; + + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + + *pa0 = aa0; + *pa1 = aa1; + + *a = aa0; + *(a + 1)= aa1; + + a += 2; + + pb = b + i + 1; + pc0 = pa0 + ldc; + pc1 = pa1 + ldc; + for (k = (n - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + + pb += vl; + pc0++; + pc1++; + } + pc += 2; + } + + if (m & 1) + { + pa0 = pc; + aa0 = *pa0 * bb; + + *pa0 = aa0; + *a = aa0; + a += 1; + + pb = b + i + 1; + pc0 = pa0 + ldc; + for (k = (n - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, 
vl); + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + pb += vl; + pc0++; + } + } + b += n; + } +} +#else +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < n; i++) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = i + 1; k < n; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b += n; + } +} + +#endif + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < n; i++) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = -aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = i + 1; k < n; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= - cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b += n * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j; + + size_t vl = VSETVL_MAX; + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug + + + j = (n >> GEMM_UNROLL_N_SHIFT); + kk = -offset; + + while (j > 0) { + + aa = a; + cc = c; + + i = vl; + + if (i <= m) { + do { + if (kk > 0) { + GEMM_KERNEL(vl, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + solve(vl, GEMM_UNROLL_N, + aa + kk * vl * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += vl * k * COMPSIZE; + cc += vl * COMPSIZE; + i += vl; + } while (i <= m); + } + + + i = m % vl; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + + kk += GEMM_UNROLL_N; + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + aa = a; + cc = c; + + i = vl; + + while (i <= m) { + if (kk > 0) { + GEMM_KERNEL(vl, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(vl, j, + aa + kk * vl * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += vl * k * COMPSIZE; + cc += vl * COMPSIZE; + i += vl; + } + + i = m % vl; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + kk += j; 
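+        /* kk has now advanced past the j columns of B solved in this
+           remainder pass, keeping the diagonal offset correct for the
+           next (narrower) width */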
+ } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/riscv64/trsm_kernel_RT_rvv_v1.c b/kernel/riscv64/trsm_kernel_RT_rvv_v1.c new file mode 100644 index 000000000..459c1663a --- /dev/null +++ b/kernel/riscv64/trsm_kernel_RT_rvv_v1.c @@ -0,0 +1,828 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define VSETVL_MAX vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VSSEV_FLOAT vsse32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VFMACCVF_FLOAT vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define VSETVL_MAX vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VSSEV_FLOAT vsse64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VFMACCVF_FLOAT vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 +#endif + + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +// Optimizes the implementation in ../arm64/trsm_kernel_RT_sve.c + +#ifndef COMPLEX + +#if GEMM_DEFAULT_UNROLL_N == 1 +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + FLOAT *pb, *pc; + BLASLONG stride_ldc = sizeof(FLOAT) * ldc; + + int i, j, k; + size_t vl; + FLOAT_V_T vb, vc; + + a += (n - 1) * m; + b += (n - 1) * n; + + for (i = n - 1; i >= 0; i--) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + pb = b; + pc = c + j; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc = VLSEV_FLOAT(pc, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + vc = VFNMSACVF_FLOAT(vc, aa, vb, vl); + VSSEV_FLOAT(pc, stride_ldc, vc, vl); + pb += vl; + pc++; + } + } + b -= n; + a -= 2 * m; + } + +} +#elif GEMM_DEFAULT_UNROLL_N == 2 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa0, aa1, bb; + FLOAT *pb, *pc; + FLOAT *pa0, *pa1, *pc0, *pc1; + BLASLONG stride_ldc = sizeof(FLOAT) * ldc; + int i, j, k; + size_t vl; + FLOAT_V_T vb, vc0, vc1; + + a += (n - 1) * m; + b += (n - 1) * n; + + for (i = n - 1; i >= 0; i--) + { + bb = *(b + i); + pc = c + i * ldc; + for (j = 0; j < m/2; j ++) + { + pa0 = pc + j * 2; + pa1 = pc + j * 2 + 1; + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + + *pa0 = aa0; + *pa1 = aa1; + *a = aa0; + *(a + 1)= aa1; + a += 2; + + pb = b; + pc0 = c + j * 2; + pc1 = pc0 + 1; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + pb += vl; + pc0++; + pc1++; + } + } + pc += (m/2)*2; + + if (m & 1) + { + pa0 = pc; + aa0 = *pa0 * bb; + + *pa0 = aa0; + *a = aa0; + a += 1; + + pb = b; + pc0 = pc - i * ldc; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + 
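+                    /* vc0 above gathers one row of C with stride ldc; the
+                       matching slice of packed B is loaded contiguously next */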
vb = VLEV_FLOAT(pb, vl); + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + pb += vl; + pc0++; + } + } + b -= n; + a -= 2 * m; + } +} + +#elif GEMM_DEFAULT_UNROLL_N == 4 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa0, aa1, aa2, aa3; + FLOAT bb; + FLOAT *pb, *pc; + FLOAT *pa0, *pa1, *pa2, *pa3; + FLOAT *pc0, *pc1, *pc2, *pc3; + BLASLONG stride_ldc = sizeof(FLOAT) * ldc; + int i, j, k; + size_t vl; + FLOAT_V_T vb, vc0, vc1, vc2, vc3; + + a += (n - 1) * m; + b += (n - 1) * n; + + for (i = n - 1; i >= 0; i--) + { + bb = *(b + i); + pc = c + i * ldc; + for (j = 0; j < m/4; j ++) + { + pa0 = pc + j * 4; + pa1 = pa0 + 1; + pa2 = pa1 + 1; + pa3 = pa2 + 1; + + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + aa2 = *pa2 * bb; + aa3 = *pa3 * bb; + + *pa0 = aa0; + *pa1 = aa1; + *pa2 = aa2; + *pa3 = aa3; + + *a = aa0; + *(a + 1)= aa1; + *(a + 2)= aa2; + *(a + 3)= aa3; + a += 4; + + pb = b; + pc0 = c + j * 4; + pc1 = pc0 + 1; + pc2 = pc1 + 1; + pc3 = pc2 + 1; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); + vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); + vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); + + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); + VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); + + pb += vl; + pc0++; + pc1++; + pc2++; + pc3++; + } + } + pc += (m/4)*4; + + if (m & 2) + { + pa0 = pc + j * 2; + pa1 = pa0 + 1; + + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + + *pa0 = aa0; + *pa1 = aa1; + + *a = aa0; + *(a + 1)= aa1; + a += 2; + + pb = b; + pc0 = c + j * 4; + pc1 = pc0 + 1; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + + pb += vl; + pc0++; + pc1++; + } + pc += 2; + } + + if (m & 1) + { + pa0 = pc; + aa0 = *pa0 * bb; + + *pa0 = aa0; + *a = aa0; + a += 1; + + pb = b; + pc0 = pc - i * ldc; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + pb += vl; + pc0++; + } + } + b -= n; + a -= 2 * m; + } +} +#elif GEMM_DEFAULT_UNROLL_N == 8 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa0, aa1, aa2, aa3, aa4, aa5, aa6, aa7; + FLOAT bb; + FLOAT *pb, *pc; + FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7; + FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7; + BLASLONG stride_ldc = sizeof(FLOAT) * ldc; + int i, j, k; + size_t vl; + FLOAT_V_T vb, vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7; + + a += (n - 1) * m; + b += (n - 1) * n; + + for (i = n - 1; i >= 0; i--) + { + bb = *(b + i); + pc = c + i * ldc; + for (j = 0; j < m/8; j ++) + { + pa0 = pc + j * 8; + pa1 = pa0 + 1; + pa2 = pa1 + 1; + pa3 = pa2 + 1; + pa4 = pa3 + 1; + pa5 = pa4 + 1; + pa6 = pa5 + 1; + pa7 = pa6 + 1; + + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + aa2 = *pa2 * bb; + aa3 = *pa3 * bb; + aa4 = *pa4 * bb; + aa5 = *pa5 * 
bb; + aa6 = *pa6 * bb; + aa7 = *pa7 * bb; + + *pa0 = aa0; + *pa1 = aa1; + *pa2 = aa2; + *pa3 = aa3; + *pa4 = aa4; + *pa5 = aa5; + *pa6 = aa6; + *pa7 = aa7; + + *a = aa0; + *(a + 1)= aa1; + *(a + 2)= aa2; + *(a + 3)= aa3; + *(a + 4)= aa4; + *(a + 5)= aa5; + *(a + 6)= aa6; + *(a + 7)= aa7; + a += 8; + + pb = b; + pc0 = c + j * 8; + pc1 = pc0 + 1; + pc2 = pc1 + 1; + pc3 = pc2 + 1; + pc4 = pc3 + 1; + pc5 = pc4 + 1; + pc6 = pc5 + 1; + pc7 = pc6 + 1; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); + vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); + vc4 = VLSEV_FLOAT(pc4, stride_ldc, vl); + vc5 = VLSEV_FLOAT(pc5, stride_ldc, vl); + vc6 = VLSEV_FLOAT(pc6, stride_ldc, vl); + vc7 = VLSEV_FLOAT(pc7, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); + vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); + vc4 = VFNMSACVF_FLOAT(vc4, aa4, vb, vl); + vc5 = VFNMSACVF_FLOAT(vc5, aa5, vb, vl); + vc6 = VFNMSACVF_FLOAT(vc6, aa6, vb, vl); + vc7 = VFNMSACVF_FLOAT(vc7, aa7, vb, vl); + + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); + VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); + VSSEV_FLOAT(pc4, stride_ldc, vc4, vl); + VSSEV_FLOAT(pc5, stride_ldc, vc5, vl); + VSSEV_FLOAT(pc6, stride_ldc, vc6, vl); + VSSEV_FLOAT(pc7, stride_ldc, vc7, vl); + + pb += vl; + pc0++; + pc1++; + pc2++; + pc3++; + pc4++; + pc5++; + pc6++; + pc7++; + } + } + pc += (m/8)*8; + + if (m & 4) + { + pa0 = pc; + pa1 = pa0 + 1; + pa2 = pa1 + 1; + pa3 = pa2 + 1; + + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + aa2 = *pa2 * bb; + aa3 = *pa3 * bb; + + *pa0 = aa0; + *pa1 = aa1; + *pa2 = aa2; + *pa3 = aa3; + + *a = aa0; + *(a + 1)= aa1; + *(a + 2)= aa2; + *(a + 3)= aa3; + a += 4; + + pb = b; + pc0 = pc - i * ldc; + pc1 = pc0 + 1; + pc2 = pc1 + 1; + pc3 = pc2 + 1; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); + vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); + vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); + + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); + VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); + + pb += vl; + pc0++; + pc1++; + pc2++; + pc3++; + } + pc += 4; + } + + if (m & 2) + { + pa0 = pc; + pa1 = pa0 + 1; + + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + + *pa0 = aa0; + *pa1 = aa1; + + *a = aa0; + *(a + 1)= aa1; + a += 2; + + pb = b; + pc0 = pc - i * ldc; + pc1 = pc0 + 1; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + + pb += vl; + pc0++; + pc1++; + } + pc += 2; + } + + if (m & 1) + { + pa0 = pc; + aa0 = *pa0 * bb; + + *pa0 = aa0; + *a = aa0; + a += 1; + + pb = b; + pc0 = pc - i * ldc; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + vc0 = VFNMSACVF_FLOAT(vc0, 
aa0, vb, vl); + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + pb += vl; + pc0++; + } + } + b -= n; + a -= 2 * m; + } +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + a += (n - 1) * m; + b += (n - 1) * n; + + for (i = n - 1; i >= 0; i--) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = 0; k < i; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b -= n; + a -= 2 * m; + } + +} + +#endif + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + a += (n - 1) * m * 2; + b += (n - 1) * n * 2; + + for (i = n - 1; i >= 0; i--) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = - aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= -cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b -= n * 2; + a -= 4 * m; + } + +} + +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + + size_t vl = VSETVL_MAX; + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug + + kk = n - offset; + c += n * ldc * COMPSIZE; + b += n * k * COMPSIZE; + + if (n & (GEMM_UNROLL_N - 1)) { + + j = 1; + while (j < GEMM_UNROLL_N) { + if (n & j) { + + aa = a; + b -= j * k * COMPSIZE; + c -= j * ldc* COMPSIZE; + cc = c; + + i = vl; + if (i <= m) { + + do { + if (k - kk > 0) { + GEMM_KERNEL(vl, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + vl * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(vl, j, + aa + (kk - j) * vl * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += vl * k * COMPSIZE; + cc += vl * COMPSIZE; + i += vl; + } while (i <= m); + } + + i = m % vl; + if (i) { + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - j) * i * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + kk -= j; + } + j <<= 1; + } + } + + j = (n >> GEMM_UNROLL_N_SHIFT); + + if (j > 0) { + + do { + aa = a; + b -= GEMM_UNROLL_N * k * COMPSIZE; + c -= GEMM_UNROLL_N * ldc * COMPSIZE; + cc = c; + + i = vl; + if (i <= m) { + do { + if (k - kk > 0) { + GEMM_KERNEL(vl, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + vl * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(vl, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * vl * COMPSIZE, + b + (kk - 
GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += vl * k * COMPSIZE; + cc += vl * COMPSIZE; + i += vl; + } while (i <= m); + } + + i = m % vl; + if (i) { + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * i * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + + kk -= GEMM_UNROLL_N; + j --; + } while (j > 0); + } + + return 0; +} + + diff --git a/kernel/riscv64/trsm_lncopy_rvv_v1.c b/kernel/riscv64/trsm_lncopy_rvv_v1.c new file mode 100644 index 000000000..bacfb2b08 --- /dev/null +++ b/kernel/riscv64/trsm_lncopy_rvv_v1.c @@ -0,0 +1,122 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define FLOAT_V_T vfloat32m2_t
+#define VLEV_FLOAT vle32_v_f32m2
+#define VSEV_FLOAT vse32_v_f32m2
+#define VSEV_FLOAT_M vse32_v_f32m2_m
+#define VLSEV_FLOAT vlse32_v_f32m2
+#define VBOOL_T vbool16_t
+#define UINT_V_T vuint32m2_t
+#define VID_V_UINT vid_v_u32m2
+#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define FLOAT_V_T vfloat64m2_t
+#define VLEV_FLOAT vle64_v_f64m2
+#define VSEV_FLOAT vse64_v_f64m2
+#define VSEV_FLOAT_M vse64_v_f64m2_m
+#define VLSEV_FLOAT vlse64_v_f64m2
+#define VBOOL_T vbool32_t
+#define UINT_V_T vuint64m2_t
+#define VID_V_UINT vid_v_u64m2
+#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32
+
+#endif
+
+#ifndef UNIT
+#define INV(a) (ONE / (a))
+#else
+#define INV(a) (ONE)
+#endif
+
+// Optimizes the implementation in ../arm64/trsm_lncopy_sve.c
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
+
+    BLASLONG i, ii, jj, js;
+
+    FLOAT *ao;
+
+    jj = offset;
+
+    BLASLONG stride_lda = sizeof(FLOAT)*lda;
+
+    FLOAT_V_T va1;
+    VBOOL_T vbool_cmp;
+    UINT_V_T vindex;
+    size_t vl;
+
+    for (js = n; js > 0; js -= vl)
+    {
+        vl = VSETVL(js);
+        ao = a;
+
+        ii = 0;
+        for (i = 0; i < m;)
+        {
+            if (ii == jj)
+            {
+                /* diagonal block: masked store keeps the strictly-lower part
+                   of each row, and the diagonal element is stored inverted */
+                vindex = VID_V_UINT(vl);
+                for (unsigned int j = 0; j < vl; j++)
+                {
+                    va1 = VLSEV_FLOAT(ao, stride_lda, vl);
+                    vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl);
+                    VSEV_FLOAT_M(vbool_cmp, b, va1, vl);
+
+                    *(b + j) = INV(*(ao + j * lda));
+                    ao++;
+                    b += vl;
+                }
+                i += vl;
+                ii += vl;
+            }
+            else
+            {
+                if (ii > jj)
+                {
+                    /* block strictly below the diagonal: copy a full row */
+                    va1 = VLSEV_FLOAT(ao, stride_lda, vl);
+                    VSEV_FLOAT(b, va1, vl);
+                }
+                ao++;
+                b += vl;
+                i++;
+                ii++;
+            }
+        }
+
+        a += vl * lda;
+        jj += vl;
+    }
+
+    return 0;
+}
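+
+/*
+ * Illustrative scalar equivalent of the diagonal-block path above (a sketch
+ * for readers only, kept inside a comment so nothing extra is compiled;
+ * indices are local to the vl x vl diagonal block):
+ *
+ *     for (j = 0; j < vl; j++) {              // one block row per pass
+ *         for (t = 0; t < j; t++)
+ *             b[t] = a[j + t * lda];          // strictly-lower entries of row j
+ *         b[j] = INV(a[j + j * lda]);         // diagonal stored as reciprocal
+ *         b += vl;                            // lanes j+1..vl-1 left as padding
+ *     }
+ */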
diff --git a/kernel/riscv64/trsm_ltcopy_rvv_v1.c b/kernel/riscv64/trsm_ltcopy_rvv_v1.c
new file mode 100644
index 000000000..0fc7c9f24
--- /dev/null
+++ b/kernel/riscv64/trsm_ltcopy_rvv_v1.c
@@ -0,0 +1,122 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define FLOAT_V_T vfloat32m2_t
+#define VLEV_FLOAT vle32_v_f32m2
+#define VSEV_FLOAT vse32_v_f32m2
+#define VSEV_FLOAT_M vse32_v_f32m2_m
+#define VLSEV_FLOAT vlse32_v_f32m2
+#define VBOOL_T vbool16_t
+#define UINT_V_T vuint32m2_t
+#define VID_V_UINT vid_v_u32m2
+#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define FLOAT_V_T vfloat64m2_t
+#define VLEV_FLOAT vle64_v_f64m2
+#define VSEV_FLOAT vse64_v_f64m2
+#define VSEV_FLOAT_M vse64_v_f64m2_m
+#define VLSEV_FLOAT vlse64_v_f64m2
+#define VBOOL_T vbool32_t
+#define UINT_V_T vuint64m2_t
+#define VID_V_UINT vid_v_u64m2
+#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32
+#endif
+
+#ifndef UNIT
+#define INV(a) (ONE / (a))
+#else
+#define INV(a) (ONE)
+#endif
+
+// Optimizes the implementation in ../arm64/trsm_ltcopy_sve.c
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
+
+    BLASLONG i, ii, jj, js;
+
+    FLOAT *ao;
+
+    jj = offset;
+
+    FLOAT_V_T va1;
+    VBOOL_T vbool_cmp;
+    UINT_V_T vindex;
+
+    size_t vl;
+
+    for (js = n; js > 0; js -= vl)
+    {
+        vl = VSETVL(js);
+        ao = a;
+
+        ii = 0;
+        for (i = 0; i < m;)
+        {
+
+            if (ii == jj)
+            {
+                /* diagonal block: keep the part of each column below the
+                   diagonal, and store the diagonal element inverted */
+                vindex = VID_V_UINT(vl);
+                for (unsigned int j = 0; j < vl; j++)
+                {
+                    *(b + j) = INV(*(ao + j));
+
+                    va1 = VLEV_FLOAT(ao, vl);
+                    vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl);
+                    VSEV_FLOAT_M(vbool_cmp, b, va1, vl);
+
+                    b += vl;
+                    ao += lda;
+                }
+                i += vl;
+                ii += vl;
+            }
+            else
+            {
+                if (ii < jj)
+                {
+                    va1 = VLEV_FLOAT(ao, vl);
+                    VSEV_FLOAT(b, va1, vl);
+                }
+                ao += lda;
+                b += vl;
+                i ++;
+                ii ++;
+            }
+        }
+
+        a += vl;
+        jj += vl;
+    }
+    return 0;
+}
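+
+/*
+ * Sketch of the diagonal-block path above (comment only, not compiled;
+ * indices local to the vl x vl block): each column is loaded contiguously,
+ * the diagonal is inverted and the sub-diagonal lanes are kept:
+ *
+ *     for (j = 0; j < vl; j++) {
+ *         b[j] = INV(a[j + j * lda]);         // diagonal as reciprocal
+ *         for (t = j + 1; t < vl; t++)
+ *             b[t] = a[t + j * lda];          // entries below the diagonal
+ *         b += vl;
+ *     }
+ */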
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VSEV_FLOAT_M vse32_v_f32m2_m +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT vid_v_u32m2 +#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VSEV_FLOAT_M vse64_v_f64m2_m +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 +#endif + + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +// Optimizes the implementation in ../arm64/trsm_uncopy_sve.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj, js; + BLASLONG stride_lda = sizeof(FLOAT)*lda; + + FLOAT *ao; + jj = offset; + + FLOAT_V_T va1; + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + i = 0; + ii = 0; + for (i = 0; i < m;) + { + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + *(b + j) = INV(*(ao + j * lda)); + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + VSEV_FLOAT_M(vbool_cmp, b, va1, vl); + ao++; + b += vl; + } + i += vl; + ii += vl; + } + else + { + if (ii < jj) + { + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + VSEV_FLOAT(b, va1, vl); + } + ao++; + b += vl; + i++; + ii++; + } + } + + a += vl * lda; + jj += vl; + } + return 0; +} diff --git a/kernel/riscv64/trsm_utcopy_rvv_v1.c b/kernel/riscv64/trsm_utcopy_rvv_v1.c new file mode 100644 index 000000000..a324b0fa6 --- /dev/null +++ b/kernel/riscv64/trsm_utcopy_rvv_v1.c @@ -0,0 +1,123 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VSEV_FLOAT_M vse32_v_f32m2_m +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT vid_v_u32m2 +#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VSEV_FLOAT_M vse64_v_f64m2_m +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 +#endif + + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +// Optimizes the implementation in ../arm64/trsm_utcopy_sve.c + + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj, js; + + FLOAT *ao; + + jj = offset; + FLOAT_V_T va1; + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + ii = 0; + for (i = 0; i < m;) + { + + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + va1 = VLEV_FLOAT(ao, vl); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); + VSEV_FLOAT_M(vbool_cmp, b, va1, vl); + *(b + j) = INV(*(ao + j)); + + ao += lda; + b += vl; + } + i += vl; + ii += vl; + } + else + { + if (ii > jj) + { + va1 = VLEV_FLOAT(ao, vl); + VSEV_FLOAT(b, va1, vl); + } + ao += lda; + b += vl; + i ++; + ii ++; + } + } + + a += vl; + jj += vl; + } + + return 0; +} diff --git a/kernel/riscv64/zamax_rvv.c b/kernel/riscv64/zamax_rvv.c new file mode 100644 index 000000000..1917042be --- /dev/null +++ b/kernel/riscv64/zamax_rvv.c @@ -0,0 +1,113 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. 
+3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m4() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT vfmax_vv_f32m4 +#define VFADDVV_FLOAT vfadd_vv_f32m4 +#define VFABSV_FLOAT vfabs_v_f32m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m4() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT vfmax_vv_f64m4 +#define VFADDVV_FLOAT vfadd_vv_f64m4 +#define VFABSV_FLOAT vfabs_v_f64m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT maxf=0.0; + + if (n <= 0 || inc_x <= 0) return(maxf); + + FLOAT_V_T v0, v1, vmax; + FLOAT_V_T_M1 v_res; + + v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vmax = VFMVVF_FLOAT(0.0, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&v0, &v1, x, vl); + + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v0 = VFADDVV_FLOAT(v0, v1, vl); + vmax = VFMAXVV_FLOAT(vmax, v0, vl); + + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl); + + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v0 = VFADDVV_FLOAT(v0, v1, vl); + vmax = VFMAXVV_FLOAT(vmax, v0, vl); + } + + } + + v_res = VFREDMAXVS_FLOAT(v_res, vmax, v_res, vlmax); + maxf = VFMVFS_FLOAT_M1(v_res); + + return(maxf); +} diff --git a/kernel/riscv64/zamin_rvv.c b/kernel/riscv64/zamin_rvv.c new file mode 100644 index 000000000..3f027383a --- /dev/null +++ b/kernel/riscv64/zamin_rvv.c @@ -0,0 +1,112 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m4() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m4_f32m1 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMINVV_FLOAT vfmin_vv_f32m4 +#define VFADDVV_FLOAT vfadd_vv_f32m4 +#define VFABSV_FLOAT vfabs_v_f32m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m4() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m4_f64m1 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMINVV_FLOAT vfmin_vv_f64m4 +#define VFADDVV_FLOAT vfadd_vv_f64m4 +#define VFABSV_FLOAT vfabs_v_f64m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT minf=0.0; + + if (n <= 0 || inc_x <= 0) return(minf); + + FLOAT_V_T v0, v1, vmin; + FLOAT_V_T_M1 v_res; + + v_res = VFMVVF_FLOAT_M1(FLT_MAX, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vmin = VFMVVF_FLOAT(FLT_MAX, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&v0, &v1, x, vl); + + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v0 = VFADDVV_FLOAT(v0, v1, vl); + vmin = VFMINVV_FLOAT(vmin, v0, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl); + + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v0 = VFADDVV_FLOAT(v0, v1, vl); + vmin = VFMINVV_FLOAT(vmin, v0, vl); + } 
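+        // each lane of vmin now holds a running minimum of |re| + |im|;
+        // the cross-lane reduction below collapses it to a single scalar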
+ + } + + v_res = VFREDMINVS_FLOAT(v_res, vmin, v_res, vlmax); + minf = VFMVFS_FLOAT_M1(v_res); + + return(minf); +} diff --git a/kernel/riscv64/zasum_rvv.c b/kernel/riscv64/zasum_rvv.c new file mode 100644 index 000000000..7876646b3 --- /dev/null +++ b/kernel/riscv64/zasum_rvv.c @@ -0,0 +1,108 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VFADDVV_FLOAT vfadd_vv_f32m8 +#define VFABSV_FLOAT vfabs_v_f32m8 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VFADDVV_FLOAT vfadd_vv_f64m8 +#define VFABSV_FLOAT vfabs_v_f64m8 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT asumf = 0.0; + if (n <= 0 || inc_x <= 0) return(asumf); + + FLOAT_V_T v0, v1; + size_t vlmax = VSETVL_MAX; + FLOAT_V_T v_sum = VFMVVF_FLOAT(0, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2) { + vl = VSETVL(n); + + v0 = VLEV_FLOAT(x, vl); + v1 = VLEV_FLOAT(x+vl, vl); + + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v_sum = VFADDVV_FLOAT(v_sum, v0, vl); + v_sum = VFADDVV_FLOAT(v_sum, v1, vl); + } + + } + else { + + int stride_x = inc_x * sizeof(FLOAT) * 2; + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + v0 = VLSEV_FLOAT(x, stride_x, vl); + v1 = VLSEV_FLOAT(x+1, stride_x, vl); + + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v_sum = VFADDVV_FLOAT(v_sum, v0, vl); + v_sum = VFADDVV_FLOAT(v_sum, v1, vl); + } + + } + + FLOAT_V_T_M1 v_z0 = VFMVVF_FLOAT_M1(0, vlmax); + FLOAT_V_T_M1 v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, vlmax); + asumf += VFMVFS_FLOAT_M1(v_res); + + return(asumf); +} diff --git a/kernel/riscv64/zaxpby_rvv.c b/kernel/riscv64/zaxpby_rvv.c new file mode 100644 index 000000000..66f52d9d0 --- /dev/null +++ b/kernel/riscv64/zaxpby_rvv.c @@ -0,0 +1,151 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/***************************************************************************
+* 2014/06/07 Saar
+*
+***************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m4(n)
+#define FLOAT_V_T vfloat32m4_t
+#define VLSEV_FLOAT vlse32_v_f32m4
+#define VSSEV_FLOAT vsse32_v_f32m4
+#define VFMACCVF_FLOAT vfmacc_vf_f32m4
+#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
+#define VFMVVF_FLOAT vfmv_v_f_f32m4
+#define VFMULVF_FLOAT vfmul_vf_f32m4
+#define VFMSACVF_FLOAT vfmsac_vf_f32m4
+#define VLSEG_FLOAT vlseg2e32_v_f32m4
+#define VSSEG_FLOAT vsseg2e32_v_f32m4
+#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
+#define VSSSEG_FLOAT vssseg2e32_v_f32m4
+#else
+#define VSETVL(n) vsetvl_e64m4(n)
+#define FLOAT_V_T vfloat64m4_t
+#define VLSEV_FLOAT vlse64_v_f64m4
+#define VSSEV_FLOAT vsse64_v_f64m4
+#define VFMACCVF_FLOAT vfmacc_vf_f64m4
+#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
+#define VFMVVF_FLOAT vfmv_v_f_f64m4
+#define VFMULVF_FLOAT vfmul_vf_f64m4
+#define VFMSACVF_FLOAT vfmsac_vf_f64m4
+#define VLSEG_FLOAT vlseg2e64_v_f64m4
+#define VSSEG_FLOAT vsseg2e64_v_f64m4
+#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
+#define VSSSEG_FLOAT vssseg2e64_v_f64m4
+#endif
+
+int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FLOAT beta_r, FLOAT beta_i, FLOAT *y, BLASLONG inc_y)
+{
+    BLASLONG inc_x2, inc_y2;
+
+    if ( n <= 0 ) return(0);
+
+    inc_x2 = 2 * inc_x;
+    inc_y2 = 2 * inc_y;
+
+    BLASLONG stride_x = inc_x2 * sizeof(FLOAT);
+    BLASLONG stride_y = inc_y2 * sizeof(FLOAT);
+    FLOAT_V_T vx0, vx1, vy0, vy1;
+
+    if ( beta_r == 0.0 && beta_i == 0.0)
+    {
+        if ( alpha_r == 0.0 && alpha_i == 0.0 )
+        {
+            size_t vl = VSETVL(n);
+            FLOAT_V_T temp = VFMVVF_FLOAT(0.0, vl);
+            // advance y by elements (vl complex values = vl*inc_y2 floats);
+            // the byte stride is only for the segment store itself
+            for ( ; n > 0; n -= vl, y += vl*inc_y2)
+            {
+                vl = VSETVL(n);
+                VSSSEG_FLOAT(y, stride_y, temp, temp, vl);
+            }
+        }
+        else
+        {
+            for (size_t vl; n > 0; n -= vl, x += vl*inc_x2, y += vl*inc_y2)
+            {
+                vl = VSETVL(n);
+                VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
+
+                vy0 = VFMULVF_FLOAT(vx1, alpha_i, vl);
+                vy0 = VFMSACVF_FLOAT(vy0, alpha_r, vx0, vl);
+
+                vy1 = VFMULVF_FLOAT(vx1, alpha_r, vl);
+                vy1 = VFMACCVF_FLOAT(vy1, alpha_i, vx0, vl);
+
+                VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl);
+            }
+        }
+    }
+    else
+    {
+        FLOAT_V_T v0, v1;
+
+        if ( alpha_r == 0.0 && alpha_i == 0.0 )
+        {
+            for (size_t vl; n > 0; n -= vl, y += vl*inc_y2)
+            {
+                vl = VSETVL(n);
+                VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);
+
+                v0 = VFMULVF_FLOAT(vy1, beta_i, vl);
+                v0 = VFMSACVF_FLOAT(v0, beta_r, vy0, vl);
+
+                v1 = VFMULVF_FLOAT(vy1, beta_r, vl);
+                v1 = VFMACCVF_FLOAT(v1, beta_i, vy0, vl);
+
+                VSSSEG_FLOAT(y, stride_y, v0, v1, vl);
+            }
+        }
+        else
+        {
+            for (size_t vl; n > 0; n -= vl, x += vl*inc_x2, y += vl*inc_y2)
+            {
+                vl = VSETVL(n);
+                VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
+                VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);
+
+                v0 = VFMULVF_FLOAT(vx0, alpha_r, vl);
+                v0 = VFNMSACVF_FLOAT(v0, alpha_i, vx1, vl);
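+                // v0 now holds Re(alpha*x); the lines below fold in
+                // Re(beta*y) = beta_r*y_r - beta_i*y_i, then build the
+                // imaginary part the same way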
v0 = VFMACCVF_FLOAT(v0, beta_r, vy0, vl); + v0 = VFNMSACVF_FLOAT(v0, beta_i, vy1, vl); + + v1 = VFMULVF_FLOAT(vx1, alpha_r, vl); + v1 = VFMACCVF_FLOAT(v1, alpha_i, vx0, vl); + v1 = VFMACCVF_FLOAT(v1, beta_r, vy1, vl); + v1 = VFMACCVF_FLOAT(v1, beta_i, vy0, vl); + + VSSSEG_FLOAT(y, stride_y, v0, v1, vl); + } + } + } + return(0); + +} diff --git a/kernel/riscv64/zaxpy_rvv.c b/kernel/riscv64/zaxpy_rvv.c new file mode 100644 index 000000000..777bcb728 --- /dev/null +++ b/kernel/riscv64/zaxpy_rvv.c @@ -0,0 +1,154 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VSSEG_FLOAT vsseg2e32_v_f32m4 +#define VSSSEG_FLOAT vssseg2e32_v_f32m4 +#define VFMACCVF_FLOAT vfmacc_vf_f32m4 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VSSEG_FLOAT vsseg2e64_v_f64m4 +#define VSSSEG_FLOAT vssseg2e64_v_f64m4 +#define VFMACCVF_FLOAT vfmacc_vf_f64m4 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + if(n < 0) return(0); + if(da_r == 0.0 && da_i == 0.0) return(0); + + FLOAT_V_T vx0, vx1, vy0, vy1; + + if(inc_x == 1 && inc_y == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&vx0, &vx1, x, vl); + VLSEG_FLOAT(&vy0, &vy1, y, vl); + #if !defined(CONJ) + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #else + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFMACCVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #endif + VSSEG_FLOAT(y, vy0, vy1, vl); + } + + } else if (inc_x == 1) { + + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&vx0, &vx1, x, vl); + VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + #if !defined(CONJ) + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #else + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFMACCVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #endif + VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl); + } + + } else if (inc_y == 1) { + + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + VLSEG_FLOAT(&vy0, &vy1, y, vl); + + #if !defined(CONJ) + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #else + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFMACCVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #endif + VSSEG_FLOAT(y, vy0, vy1, vl); + } + + } else { + + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + #if !defined(CONJ) + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #else + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 
= VFMACCVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #endif + VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl); + } + + } + + return(0); +} diff --git a/kernel/riscv64/zcopy_rvv.c b/kernel/riscv64/zcopy_rvv.c new file mode 100644 index 000000000..5d8322bbb --- /dev/null +++ b/kernel/riscv64/zcopy_rvv.c @@ -0,0 +1,105 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL_M8(n) vsetvl_e32m8(n) +#define FLOAT_V_T_M8 vfloat32m8_t +#define VLEV_FLOAT_M8 vle32_v_f32m8 +#define VSEV_FLOAT_M8 vse32_v_f32m8 + +#define VSETVL_M4(n) vsetvl_e32m4(n) +#define FLOAT_V_T_M4 vfloat32m4_t +#define VLSEG_FLOAT_M4 vlseg2e32_v_f32m4 +#define VSSEG_FLOAT_M4 vsseg2e32_v_f32m4 +#define VLSSEG_FLOAT_M4 vlsseg2e32_v_f32m4 +#define VSSSEG_FLOAT_M4 vssseg2e32_v_f32m4 +#else +#define VSETVL_M8(n) vsetvl_e64m8(n) +#define FLOAT_V_T_M8 vfloat64m8_t +#define VLEV_FLOAT_M8 vle64_v_f64m8 +#define VSEV_FLOAT_M8 vse64_v_f64m8 + +#define VSETVL_M4(n) vsetvl_e64m4(n) +#define FLOAT_V_T_M4 vfloat64m4_t +#define VLSEG_FLOAT_M4 vlseg2e64_v_f64m4 +#define VSSEG_FLOAT_M4 vsseg2e64_v_f64m4 +#define VLSSEG_FLOAT_M4 vlsseg2e64_v_f64m4 +#define VSSSEG_FLOAT_M4 vssseg2e64_v_f64m4 +#endif + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + if(n < 0) return(0); + + if(inc_x == 1 && inc_y == 1) { + + FLOAT_V_T_M8 vx; + n *= 2; // convert to words + + for(size_t vl; n > 0; n -= vl, x += vl, y += vl) { + vl = VSETVL_M8(n); + vx = VLEV_FLOAT_M8(x, vl); + VSEV_FLOAT_M8(y, vx, vl); + } + + }else if (1 == inc_x) { + + FLOAT_V_T_M4 vr, vi; + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for(size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { + vl = VSETVL_M4(n); + VLSEG_FLOAT_M4(&vr, &vi, x, vl); + VSSSEG_FLOAT_M4(y, stride_y, vr, vi, vl); + } + } else if (1 == inc_y) { + + FLOAT_V_T_M4 vr, vi; + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + + for(size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { + vl = VSETVL_M4(n); + VLSSEG_FLOAT_M4(&vr, &vi, x, stride_x, vl); + VSSEG_FLOAT_M4(y, vr, vi, vl); + } + } else { + + FLOAT_V_T_M4 vr, vi; + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for(size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { + vl = VSETVL_M4(n); + VLSSEG_FLOAT_M4(&vr, &vi, x, stride_x, vl); + VSSSEG_FLOAT_M4(y, stride_y, vr, vi, vl); + } + } + + return(0); +} diff --git a/kernel/riscv64/zdot_rvv.c b/kernel/riscv64/zdot_rvv.c new file mode 100644 index 000000000..7eae6f608 --- /dev/null +++ b/kernel/riscv64/zdot_rvv.c @@ -0,0 +1,170 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m4() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT vfmacc_vv_f32m4 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMULVV_FLOAT vfmul_vv_f32m4 +#define VFMSACVV_FLOAT vfmsac_vv_f32m4 +#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m4() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT vfmacc_vv_f64m4 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMULVV_FLOAT vfmul_vv_f64m4 +#define VFMSACVV_FLOAT vfmsac_vv_f64m4 +#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + OPENBLAS_COMPLEX_FLOAT result; + CREAL(result) = 0.0; + CIMAG(result) = 0.0; + + if ( n <= 0 ) return(result); + + FLOAT_V_T vr0, vr1, vx0, vx1, vy0, vy1; + FLOAT_V_T_M1 v_res, v_z0; + size_t vlmax_m1 = VSETVL_MAX_M1; + v_res = VFMVVF_FLOAT_M1(0, vlmax_m1); + v_z0 = VFMVVF_FLOAT_M1(0, vlmax_m1); + + size_t vlmax = VSETVL_MAX; + vr0 = VFMVVF_FLOAT(0, vlmax); + vr1 = VFMVVF_FLOAT(0, vlmax); + + if(inc_x == 1 && inc_y == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&vx0, &vx1, x, vl); + VLSEG_FLOAT(&vy0, &vy1, y, vl); + + vr0 = VFMACCVV_FLOAT(vr0, vx0, vy0, vl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, vy1, vl); + #if !defined(CONJ) + vr0 = VFNMSACVV_FLOAT(vr0, vx1, vy1, vl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, vy0, vl); + #else + vr0 = VFMACCVV_FLOAT(vr0, vx1, vy1, vl); + vr1 = VFNMSACVV_FLOAT(vr1, vx1, vy0, vl); + #endif + } + + } else if (inc_x == 1){ + + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&vx0, &vx1, x, vl); + VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + vr0 = VFMACCVV_FLOAT(vr0, vx0, vy0, vl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, vy1, vl); + #if !defined(CONJ) + vr0 = VFNMSACVV_FLOAT(vr0, vx1, vy1, vl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, vy0, vl); + #else + vr0 = VFMACCVV_FLOAT(vr0, vx1, vy1, vl); + vr1 = VFNMSACVV_FLOAT(vr1, vx1, vy0, vl); + #endif + } + } else if (inc_y == 1){ + + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { + vl = VSETVL(n); + + 
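+            // mixed-stride case: x is gathered with a strided segment load,
+            // y with a unit-stride one; the multiply-accumulate pattern is
+            // the same as in the contiguous path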
VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + VLSEG_FLOAT(&vy0, &vy1, y, vl); + + vr0 = VFMACCVV_FLOAT(vr0, vx0, vy0, vl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, vy1, vl); + #if !defined(CONJ) + vr0 = VFNMSACVV_FLOAT(vr0, vx1, vy1, vl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, vy0, vl); + #else + vr0 = VFMACCVV_FLOAT(vr0, vx1, vy1, vl); + vr1 = VFNMSACVV_FLOAT(vr1, vx1, vy0, vl); + #endif + } + }else { + + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + vr0 = VFMACCVV_FLOAT(vr0, vx0, vy0, vl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, vy1, vl); + #if !defined(CONJ) + vr0 = VFNMSACVV_FLOAT(vr0, vx1, vy1, vl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, vy0, vl); + #else + vr0 = VFMACCVV_FLOAT(vr0, vx1, vy1, vl); + vr1 = VFNMSACVV_FLOAT(vr1, vx1, vy0, vl); + #endif + } + } + + v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, vlmax); + CREAL(result) = VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, vlmax); + CIMAG(result) = VFMVFS_FLOAT_M1(v_res); + + return(result); +} diff --git a/kernel/riscv64/zgemm_beta_rvv.c b/kernel/riscv64/zgemm_beta_rvv.c new file mode 100644 index 000000000..a89752d18 --- /dev/null +++ b/kernel/riscv64/zgemm_beta_rvv.c @@ -0,0 +1,117 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VSSEG_FLOAT vsseg2e32_v_f32m4 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMULVF_FLOAT vfmul_vf_f32m4 +#define VFADDVV_FLOAT vfadd_vv_f32m4 +#define VFSUBVV_FLOAT vfsub_vv_f32m4 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VSSEG_FLOAT vsseg2e64_v_f64m4 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMULVF_FLOAT vfmul_vf_f64m4 +#define VFADDVV_FLOAT vfadd_vv_f64m4 +#define VFSUBVV_FLOAT vfsub_vv_f64m4 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, + FLOAT beta_r, FLOAT beta_i, + FLOAT *dummy2, BLASLONG dummy3, + FLOAT *dummy4, BLASLONG dummy5, + FLOAT *c, BLASLONG ldc) +{ + BLASLONG chunk; + FLOAT *c_offset; + size_t vl; + FLOAT_V_T vr, vi, v1, v2, v3, v4; + + ldc *= 2; + c_offset = c; + + if (beta_r == 0.0 && beta_i == 0.0) { + + vl = VSETVL(m); + vr = VFMVVF_FLOAT(0.0, vl); + vi = VFMVVF_FLOAT(0.0, vl); + + for( ; n > 0; n--, c += ldc) { + c_offset = c; + + for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl*2) { + vl = VSETVL(chunk); + + VSSEG_FLOAT(c_offset, vr, vi, vl); + } + } + + } else { + + for( ; n > 0; n--, c += ldc) { + c_offset = c; + + for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl*2) { + vl = VSETVL(chunk); + + VLSEG_FLOAT(&vr, &vi, c_offset, vl); + + v1 = VFMULVF_FLOAT(vr, beta_r, vl); + v2 = VFMULVF_FLOAT(vi, beta_i, vl); + + v3 = VFMULVF_FLOAT(vi, beta_r, vl); + v4 = VFMULVF_FLOAT(vr, beta_i, vl); + + vr = VFSUBVV_FLOAT(v1, v2, vl); + vi = VFADDVV_FLOAT(v3, v4, vl); + + VSSEG_FLOAT(c_offset, vr, vi, vl); + } + } + + } + + return 0; +} diff --git a/kernel/riscv64/zgemv_n_rvv.c b/kernel/riscv64/zgemv_n_rvv.c new file mode 100644 index 000000000..2eeb61b45 --- /dev/null +++ b/kernel/riscv64/zgemv_n_rvv.c @@ -0,0 +1,170 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VSSEG_FLOAT vsseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VSSSEG_FLOAT vssseg2e32_v_f32m4 +#define VFMACCVF_FLOAT vfmacc_vf_f32m4 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VSSEG_FLOAT vsseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VSSSEG_FLOAT vssseg2e64_v_f64m4 +#define VFMACCVF_FLOAT vfmacc_vf_f64m4 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix; + FLOAT *a_ptr; + FLOAT temp_r, temp_i; + FLOAT_V_T va0, va1, vy0, vy1; + + BLASLONG stride_y = inc_y * sizeof(FLOAT) * 2; + + BLASLONG inc_x2 = inc_x * 2; + BLASLONG lda2 = lda * 2; + if (inc_y == 1) + { + for (size_t vl; m > 0; m -= vl, a += vl*2, y += vl*2) { + vl = VSETVL(m); + a_ptr = a; + ix = 0; + VLSEG_FLOAT(&vy0, &vy1, y, vl); + + for(i = 0; i < n; i++){ +#if !defined(XCONJ) + temp_r = alpha_r * x[ix] - alpha_i * x[ix+1]; + temp_i = alpha_r * x[ix+1] + alpha_i * x[ix]; +#else + temp_r = alpha_r * x[ix] + alpha_i * x[ix+1]; + temp_i = alpha_r * x[ix+1] - alpha_i * x[ix]; +#endif + + VLSEG_FLOAT(&va0, &va1, a_ptr, vl); +#if !defined(CONJ) +#if !defined(XCONJ) + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, vl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, vl); +#endif +#else +#if !defined(XCONJ) + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, vl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, vl); +#endif +#endif + a_ptr += lda2; + ix += inc_x2; + } + VSSEG_FLOAT(y, vy0, vy1, vl); + } + + } + else + { + for (size_t vl; m > 0; m -= vl, a += vl*2, y += vl*inc_y*2) { + vl = VSETVL(m); + a_ptr = a; + ix = 0; + VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + for(i = 0; i < n; i++){ +#if !defined(XCONJ) + temp_r = 
alpha_r * x[ix] - alpha_i * x[ix+1]; + temp_i = alpha_r * x[ix+1] + alpha_i * x[ix]; +#else + temp_r = alpha_r * x[ix] + alpha_i * x[ix+1]; + temp_i = alpha_r * x[ix+1] - alpha_i * x[ix]; +#endif + + VLSEG_FLOAT(&va0, &va1, a_ptr, vl); +#if !defined(CONJ) +#if !defined(XCONJ) + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, vl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, vl); +#endif +#else +#if !defined(XCONJ) + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, vl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, vl); +#endif +#endif + a_ptr += lda2; + ix += inc_x2; + } + VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl); + } + } + return(0); +} diff --git a/kernel/riscv64/zgemv_t_rvv.c b/kernel/riscv64/zgemv_t_rvv.c new file mode 100644 index 000000000..b682d5cd8 --- /dev/null +++ b/kernel/riscv64/zgemv_t_rvv.c @@ -0,0 +1,172 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT vfmacc_vv_f32m4 +#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMULVV_FLOAT vfmul_vv_f32m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT vfmacc_vv_f64m4 +#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMULVV_FLOAT vfmul_vv_f64m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i = 0, j = 0; + BLASLONG ix = 0, iy = 0; + FLOAT *a_ptr = a; + FLOAT temp_r, temp_i; + + FLOAT_V_T va0, va1, vx0, vx1, vr, vi; + FLOAT_V_T_M1 v_res, v_z0; + + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + //BLASLONG stride_a = sizeof(FLOAT) * 2; + BLASLONG inc_y2 = inc_y * 2; + BLASLONG lda2 = lda * 2; + + size_t vlmax = VSETVL_MAX_M1; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_z0 = VFMVVF_FLOAT_M1(0, vlmax); + vlmax = VSETVL(m); + + if (inc_x == 1) + { + for(i = 0; i < n; i++) { + j = 0; + ix = 0; + vr = VFMVVF_FLOAT(0, vlmax); + vi = VFMVVF_FLOAT(0, vlmax); + for(size_t vl, k = m; k > 0; k -= vl) { + vl = VSETVL(k); + + VLSEG_FLOAT(&va0, &va1, &a_ptr[j], vl); + VLSEG_FLOAT(&vx0, &vx1, &x[ix], vl); + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + vr = VFMACCVV_FLOAT(vr, va0, vx0, vl); + vr = VFNMSACVV_FLOAT(vr, va1, vx1, vl); + vi = VFMACCVV_FLOAT(vi, va0, vx1, vl); + vi = VFMACCVV_FLOAT(vi, va1, vx0, vl); +#else + vr = VFMACCVV_FLOAT(vr, va0, vx0, vl); + vr = VFMACCVV_FLOAT(vr, va1, vx1, vl); + vi = VFMACCVV_FLOAT(vi, va0, vx1, vl); + vi = VFNMSACVV_FLOAT(vi, va1, vx0, vl); +#endif + j += vl * 2; + ix += vl * inc_x * 2; + } + + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + temp_r = VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUM_FLOAT(v_res, vi, v_z0, vlmax); + temp_i = VFMVFS_FLOAT_M1(v_res); + +#if !defined(XCONJ) + y[iy] += alpha_r * temp_r - alpha_i * temp_i; + y[iy+1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y[iy] += alpha_r * temp_r + alpha_i * temp_i; + y[iy+1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + iy += inc_y2; + a_ptr += lda2; + } + } + else + { + for(i = 0; i < n; i++) { + j = 0; + ix = 0; + vr = VFMVVF_FLOAT(0, vlmax); + vi = VFMVVF_FLOAT(0, vlmax); + for(size_t vl, k = m; k > 0; k -= vl) { + vl = VSETVL(k); + + VLSEG_FLOAT(&va0, &va1, &a_ptr[j], vl); + VLSSEG_FLOAT(&vx0, &vx1, &x[ix], stride_x, vl); + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + vr = VFMACCVV_FLOAT(vr, va0, vx0, vl); + vr = VFNMSACVV_FLOAT(vr, va1, vx1, vl); + vi = VFMACCVV_FLOAT(vi, va0, vx1, vl); + vi = VFMACCVV_FLOAT(vi, va1, vx0, vl); +#else + vr = VFMACCVV_FLOAT(vr, va0, vx0, 
vl); + vr = VFMACCVV_FLOAT(vr, va1, vx1, vl); + vi = VFMACCVV_FLOAT(vi, va0, vx1, vl); + vi = VFNMSACVV_FLOAT(vi, va1, vx0, vl); +#endif + j += vl * 2; + ix += vl * inc_x * 2; + } + + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + temp_r = VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUM_FLOAT(v_res, vi, v_z0, vlmax); + temp_i = VFMVFS_FLOAT_M1(v_res); + +#if !defined(XCONJ) + y[iy] += alpha_r * temp_r - alpha_i * temp_i; + y[iy+1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y[iy] += alpha_r * temp_r + alpha_i * temp_i; + y[iy+1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + iy += inc_y2; + a_ptr += lda2; + } + + } + + + return(0); +} diff --git a/kernel/riscv64/znrm2_rvv.c b/kernel/riscv64/znrm2_rvv.c new file mode 100644 index 000000000..921ddb8cb --- /dev/null +++ b/kernel/riscv64/znrm2_rvv.c @@ -0,0 +1,122 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m4() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT vfmacc_vv_f32m4 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VFABSV_FLOAT vfabs_v_f32m4 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m4() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT vfmacc_vv_f64m4 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VFABSV_FLOAT vfabs_v_f64m4 +#endif + +// TODO: Should single precision use the widening MAC, or perhaps all should be double? + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + + if ( n <= 0 ) return(0.0); + + FLOAT_V_T vr, v0, v1; + FLOAT_V_T_M1 v_max, v_res; + FLOAT scale = 0.0, ssq = 0.0; + + size_t vlmax = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_max = VFMVVF_FLOAT_M1(0, vlmax); + + vr = VFMVVF_FLOAT(0, vlmax); + + if (inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&v0, &v1, x, vl); + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v_max = VFREDMAXVS_FLOAT(v_max, v0, v_max, vl); + vr = VFMACCVV_FLOAT(vr, v0, v0, vl); + + v_max = VFREDMAXVS_FLOAT(v_max, v1, v_max, vl); + vr = VFMACCVV_FLOAT(vr, v1, v1, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl); + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v_max = VFREDMAXVS_FLOAT(v_max, v0, v_max, vl); + vr = VFMACCVV_FLOAT(vr, v0, v0, vl); + + v_max = VFREDMAXVS_FLOAT(v_max, v1, v_max, vl); + vr = VFMACCVV_FLOAT(vr, v1, v1, vl); + } + + } + + v_res = VFREDSUM_FLOAT(v_res, vr, v_res, vlmax); + + ssq = VFMVFS_FLOAT_M1(v_res); + scale = VFMVFS_FLOAT_M1(v_max); + ssq = ssq / (scale*scale); + + return(scale * sqrt(ssq)); +} diff --git a/kernel/riscv64/zrot_rvv.c b/kernel/riscv64/zrot_rvv.c new file mode 100644 index 000000000..68066a00b --- /dev/null +++ b/kernel/riscv64/zrot_rvv.c @@ -0,0 +1,181 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
+ +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + + if ( n <= 0 ) return(0.0); + + FLOAT_V_T vr, v0, v1; + FLOAT_V_T_M1 v_max, v_res; + FLOAT scale = 0.0, ssq = 0.0; + + size_t vlmax = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_max = VFMVVF_FLOAT_M1(0, vlmax); + + vr = VFMVVF_FLOAT(0, vlmax); + + if (inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&v0, &v1, x, vl); + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v_max = VFREDMAXVS_FLOAT(v_max, v0, v_max, vl); + vr = VFMACCVV_FLOAT(vr, v0, v0, vl); + + v_max = VFREDMAXVS_FLOAT(v_max, v1, v_max, vl); + vr = VFMACCVV_FLOAT(vr, v1, v1, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl); + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v_max = VFREDMAXVS_FLOAT(v_max, v0, v_max, vl); + vr = VFMACCVV_FLOAT(vr, v0, v0, vl); + + v_max = VFREDMAXVS_FLOAT(v_max, v1, v_max, vl); + vr = VFMACCVV_FLOAT(vr, v1, v1, vl); + } + + } + + v_res = VFREDSUM_FLOAT(v_res, vr, v_res, vlmax); + + ssq = VFMVFS_FLOAT_M1(v_res); + scale = VFMVFS_FLOAT_M1(v_max); + + if (scale == 0.0) return(0.0); // all-zero input: avoid 0/0 below + + ssq = ssq / (scale*scale); + + return(scale * sqrt(ssq)); +} diff --git a/kernel/riscv64/zrot_rvv.c b/kernel/riscv64/zrot_rvv.c new file mode 100644 index 000000000..68066a00b --- /dev/null +++ b/kernel/riscv64/zrot_rvv.c @@ -0,0 +1,181 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VSSEG_FLOAT vsseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VSSSEG_FLOAT vssseg2e32_v_f32m4 +#define VFMACCVF_FLOAT vfmacc_vf_f32m4 +#define VFMULVF_FLOAT vfmul_vf_f32m4 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VSSEG_FLOAT vsseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VSSSEG_FLOAT vssseg2e64_v_f64m4 +#define VFMACCVF_FLOAT vfmacc_vf_f64m4 +#define VFMULVF_FLOAT vfmul_vf_f64m4 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 +#endif + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + + if (n <= 0) return(0); + + FLOAT_V_T vt0, vt1, vx0, vx1, vy0, vy1; + + if (inc_x == 0 || inc_y == 0) { // a zero increment feeds each step's result into the next, so stay scalar + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + + inc_x2 = 2 * inc_x ; + inc_y2 = 2 * inc_y ; + + while(i < n) + { + temp[0] = c*x[ix] + s*y[iy] ; + temp[1] = c*x[ix+1] + s*y[iy+1] ; + y[iy] = c*y[iy] - s*x[ix] ; + y[iy+1] = c*y[iy+1] - s*x[ix+1] ; + x[ix] = temp[0] ; + x[ix+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + } + } + else if(inc_x == 1 && inc_y == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&vx0, &vx1, x, vl); + VLSEG_FLOAT(&vy0, &vy1, y, vl); + + vt0 = VFMULVF_FLOAT(vx0, c, vl); + vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl); + vt1 = VFMULVF_FLOAT(vx1, c, vl); + vt1 = VFMACCVF_FLOAT(vt1, s, vy1, vl); + vy0 = VFMULVF_FLOAT(vy0, c, vl); + vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, vl); + vy1 = VFMULVF_FLOAT(vy1, c, vl); + vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl); + + VSSEG_FLOAT(x, vt0, vt1, vl); + VSSEG_FLOAT(y, vy0, vy1, vl); + } + + } else if (inc_x == 1){ + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&vx0, &vx1, x, vl); + VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + vt0 = VFMULVF_FLOAT(vx0, c, vl); + vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl); +
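+ // vt0 now holds c*x + s*y for the real plane; the imaginary plane is
+ // rotated the same way next, then y is overwritten with c*y - s*x.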
vt1 = VFMULVF_FLOAT(vx1, c, vl); + vt1 = VFMACCVF_FLOAT(vt1, s, vy1, vl); + vy0 = VFMULVF_FLOAT(vy0, c, vl); + vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, vl); + vy1 = VFMULVF_FLOAT(vy1, c, vl); + vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl); + + VSSEG_FLOAT(x, vt0, vt1, vl); + VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl); + } + + } else if (inc_y == 1){ + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + VLSEG_FLOAT(&vy0, &vy1, y, vl); + + vt0 = VFMULVF_FLOAT(vx0, c, vl); + vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl); + vt1 = VFMULVF_FLOAT(vx1, c, vl); + vt1 = VFMACCVF_FLOAT(vt1, s, vy1, vl); + vy0 = VFMULVF_FLOAT(vy0, c, vl); + vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, vl); + vy1 = VFMULVF_FLOAT(vy1, c, vl); + vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl); + + VSSSEG_FLOAT(x, stride_x, vt0, vt1, vl); + VSSEG_FLOAT(y, vy0, vy1, vl); + } + + } else { + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + vt0 = VFMULVF_FLOAT(vx0, c, vl); + vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl); + vt1 = VFMULVF_FLOAT(vx1, c, vl); + vt1 = VFMACCVF_FLOAT(vt1, s, vy1, vl); + vy0 = VFMULVF_FLOAT(vy0, c, vl); + vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, vl); + vy1 = VFMULVF_FLOAT(vy1, c, vl); + vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl); + + VSSSEG_FLOAT(x, stride_x, vt0, vt1, vl); + VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl); + } + } + + return 0; +} diff --git a/kernel/riscv64/zscal_rvv.c b/kernel/riscv64/zscal_rvv.c new file mode 100644 index 000000000..079c36a2d --- /dev/null +++ b/kernel/riscv64/zscal_rvv.c @@ -0,0 +1,148 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m4() +#define FLOAT_V_T vfloat32m4_t +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VSSEG_FLOAT vsseg2e32_v_f32m4 +#define VSSSEG_FLOAT vssseg2e32_v_f32m4 +#define VFMACCVF_FLOAT vfmacc_vf_f32m4 +#define VFMULVF_FLOAT vfmul_vf_f32m4 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m4() +#define FLOAT_V_T vfloat64m4_t +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VSSEG_FLOAT vsseg2e64_v_f64m4 +#define VSSSEG_FLOAT vssseg2e64_v_f64m4 +#define VFMACCVF_FLOAT vfmacc_vf_f64m4 +#define VFMULVF_FLOAT vfmul_vf_f64m4 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + + if((n <= 0) || (inc_x <= 0)) return(0); + + FLOAT_V_T vt, vr, vi; + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + size_t vlmax = VSETVL_MAX; + + if(da_r == 0.0 && da_i == 0.0) { + + vr = VFMVVF_FLOAT(0.0, vlmax); + vi = VFMVVF_FLOAT(0.0, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2) { + vl = VSETVL(n); + + VSSEG_FLOAT(x, vr, vi, vl); + } + + } else { + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + VSSSEG_FLOAT(x, stride_x, vr, vi, vl); + } + } + + } else if(da_r == 0.0) { + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vr, &vi, x, stride_x, vl); + + vt = VFMULVF_FLOAT(vi, -da_i, vl); + vi = VFMULVF_FLOAT(vr, da_i, vl); + + VSSSEG_FLOAT(x, stride_x, vt, vi, vl); + } + + } else if(da_i == 0.0) { + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vr, &vi, x, stride_x, vl); + + vr = VFMULVF_FLOAT(vr, da_r, vl); + vi = VFMULVF_FLOAT(vi, da_r, vl); + + VSSSEG_FLOAT(x, stride_x, vr, vi, vl); + } + + } else { + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&vr, &vi, x, vl); + + vt = VFMULVF_FLOAT(vr, da_r, vl); + vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl); + vi = VFMULVF_FLOAT(vi, da_r, vl); + vi = VFMACCVF_FLOAT(vi, da_i, vr, vl); + + VSSEG_FLOAT(x, vt, vi, vl); + } + + } else { + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vr, &vi, x, stride_x, vl); + + vt = VFMULVF_FLOAT(vr, da_r, vl); + vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl); + vi = VFMULVF_FLOAT(vi, da_r, vl); + vi = VFMACCVF_FLOAT(vi, da_i, vr, vl); + + VSSSEG_FLOAT(x, stride_x, vt, vi, vl); + } + } + } + + return(0); +} diff --git a/kernel/riscv64/zsum_rvv.c b/kernel/riscv64/zsum_rvv.c new file mode 100644 index 000000000..3928fbe27 --- /dev/null +++ b/kernel/riscv64/zsum_rvv.c @@ -0,0 +1,97 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m4() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VFREDSUMVS_FLOAT vfredusum_vs_f32m4_f32m1 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VFADDVV_FLOAT vfadd_vv_f32m4 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m4() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VFREDSUMVS_FLOAT vfredusum_vs_f64m4_f64m1 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VFADDVV_FLOAT vfadd_vv_f64m4 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT sumf = 0.0; + if (n <= 0 || inc_x <= 0) return(sumf); + + FLOAT_V_T v0, v1; + size_t vlmax = VSETVL_MAX; + FLOAT_V_T v_sum = VFMVVF_FLOAT(0, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&v0, &v1, x, vl); + + v_sum = VFADDVV_FLOAT(v_sum, v0, vl); + v_sum = VFADDVV_FLOAT(v_sum, v1, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl); + + v_sum = VFADDVV_FLOAT(v_sum, v0, vl); + v_sum = VFADDVV_FLOAT(v_sum, v1, vl); + } + + } + + FLOAT_V_T_M1 v_z0 = VFMVVF_FLOAT_M1(0, vlmax); + FLOAT_V_T_M1 v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, vlmax); + sumf += VFMVFS_FLOAT_M1(v_res); + + return(sumf); +} diff --git a/kernel/riscv64/zswap_rvv.c b/kernel/riscv64/zswap_rvv.c new file mode 100644 index 000000000..86f9103d3 --- /dev/null +++ b/kernel/riscv64/zswap_rvv.c @@ -0,0 +1,156 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VSSEG_FLOAT vsseg2e32_v_f32m4 +#define VSSSEG_FLOAT vssseg2e32_v_f32m4 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VSSEG_FLOAT vsseg2e64_v_f64m4 +#define VSSSEG_FLOAT vssseg2e64_v_f64m4 +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + + if (n <= 0) return(0); + + FLOAT_V_T vx0, vx1, vy0, vy1; + + if (inc_x == 0 && inc_y == 0) { + if (n & 1) { + FLOAT temp[2]; + temp[0] = x[0]; + temp[1] = x[1]; + x[0] = y[0]; + x[1] = y[1]; + y[0] = temp[0]; + y[1] = temp[1]; + } + else { + return 0; + } + } + else if(inc_x == 0) { + FLOAT temp[2]; + temp[0] = x[0]; + temp[1] = x[1]; + x[0] = y[(n - 1) * inc_y * 2]; + x[1] = y[(n - 1) * inc_y * 2 + 1]; + FLOAT* ptr = y + (n - 1) * inc_y * 2; // start from the last one + BLASLONG stride_y = (0 - inc_y) * sizeof(FLOAT) * 2; // reverse + BLASLONG m = n - 1; + for (size_t vl; m > 0; m -= vl, ptr -= vl*inc_y * 2) { + vl = VSETVL(m); + VLSSEG_FLOAT(&vy0, &vy1, ptr - inc_y * 2, stride_y, vl); + VSSSEG_FLOAT(ptr, stride_y, vy0, vy1, vl); + } + y[0] = temp[0]; + y[1] = temp[1]; + } + else if(inc_y == 0) { + FLOAT temp[2]; + temp[0] = y[0]; + temp[1] = y[1]; + y[0] = x[(n - 1) * inc_x * 2]; + y[1] = x[(n - 1) * inc_x * 2 + 1]; + FLOAT* ptr = x + (n - 1) * inc_x * 2; // start from the last one + BLASLONG stride_x = (0 - inc_x) * sizeof(FLOAT) * 2; // reverse + BLASLONG m = n - 1; + for (size_t vl; m > 0; m -= vl, ptr -= vl*inc_x * 2) { + vl = VSETVL(m); + VLSSEG_FLOAT(&vx0, &vx1, ptr - inc_x * 2, stride_x, vl); + VSSSEG_FLOAT(ptr, stride_x, vx0, vx1, vl); + } + x[0] = temp[0]; + x[1] = temp[1]; + } + else if(inc_x == 1 && inc_y == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&vx0, &vx1, x, vl); + VLSEG_FLOAT(&vy0, &vy1, y,
vl); + + VSSEG_FLOAT(y, vx0, vx1, vl); + VSSEG_FLOAT(x, vy0, vy1, vl); + } + + } else if (inc_x == 1){ + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&vx0, &vx1, x, vl); + VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + VSSSEG_FLOAT(y, stride_y, vx0, vx1, vl); + VSSEG_FLOAT(x, vy0, vy1, vl); + } + + } else if (inc_y == 1){ + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + VLSEG_FLOAT(&vy0, &vy1, y, vl); + + VSSEG_FLOAT(y, vx0, vx1, vl); + VSSSEG_FLOAT(x, stride_x, vy0, vy1, vl); + } + + } else { + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + VSSSEG_FLOAT(y, stride_y, vx0, vx1, vl); + VSSSEG_FLOAT(x, stride_x, vy0, vy1, vl); + } + + } + + return(0); +} diff --git a/kernel/riscv64/ztrmmkernel_2x2_rvv.c b/kernel/riscv64/ztrmmkernel_2x2_rvv.c new file mode 100644 index 000000000..3486a4648 --- /dev/null +++ b/kernel/riscv64/ztrmmkernel_2x2_rvv.c @@ -0,0 +1,596 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define VSETVL_MAX vsetvlmax_e32m2() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VLSEG4_FLOAT vlseg4e32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VFMVVF_FLOAT vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT vfmacc_vf_f32m2 +#define VFMACCVV_FLOAT vfmacc_vv_f32m2 +#define VFNMSACVV_FLOAT vfnmsac_vv_f32m2 +#define VFREDSUMVS_FLOAT vfredusum_vs_f32m2_f32m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define VSETVL_MAX vsetvlmax_e64m2() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VLSEG4_FLOAT vlseg4e64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VFMVVF_FLOAT vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT vfmacc_vf_f64m2 +#define VFMACCVV_FLOAT vfmacc_vv_f64m2 +#define VFNMSACVV_FLOAT vfnmsac_vv_f64m2 +#define VFREDSUMVS_FLOAT vfredusum_vs_f64m2_f64m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +// Optimizes the implementation in ../generic/ztrmmkernel_2x2.c + + +/******************************** + ADD1 a*c + ADD2 b*c + ADD3 a*d + ADD4 b*d + *********************************/ +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb, + FLOAT* C,BLASLONG ldc, BLASLONG offset) +{ + BLASLONG i,j,k; + FLOAT *C0,*C1,*ptrba,*ptrbb; + FLOAT res0,res1; + BLASLONG off, temp; + + FLOAT_V_T va0, va1, va2, va3, vb0, vb1, vb2, vb3; + FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; + FLOAT_V_T_M1 v_m1_res0, v_m1_res1; + FLOAT_V_T_M1 v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); + + size_t vl; + size_t vlmax = VSETVL_MAX; + +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#else + off = 0; +#endif + + for (j = bn/2; j > 0; j--) + { +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + C0 = C; + C1 = C0+2*ldc; + ptrba = ba; + + for (i = bm/2; i > 0; i--) + { +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*2*2; + ptrbb = bb+off*2*2; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); + vres4 = VFMVVF_FLOAT(0.0, vlmax); + vres5 = VFMVVF_FLOAT(0.0, vlmax); + vres6 = VFMVVF_FLOAT(0.0, vlmax); + vres7 = VFMVVF_FLOAT(0.0, vlmax); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk - off; +#elif defined(LEFT) + temp = off + 2; +#else + temp = off + 2; +#endif + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); + VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFNMSACVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl); + + vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); + vres5 = 
VFMACCVV_FLOAT(vres5, va1, vb2, vl); + vres4 = VFNMSACVV_FLOAT(vres4, va1, vb3, vl); + vres5 = VFMACCVV_FLOAT(vres5, va0, vb3, vl); + + vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl); + vres7 = VFMACCVV_FLOAT(vres7, va3, vb2, vl); + vres6 = VFNMSACVV_FLOAT(vres6, va3, vb3, vl); + vres7 = VFMACCVV_FLOAT(vres7, va2, vb3, vl); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl); + + vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); + vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl); + vres4 = VFMACCVV_FLOAT(vres4, va1, vb3, vl); + vres5 = VFNMSACVV_FLOAT(vres5, va0, vb3, vl); + + vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl); + vres7 = VFMACCVV_FLOAT(vres7, va3, vb2, vl); + vres6 = VFMACCVV_FLOAT(vres6, va3, vb3, vl); + vres7 = VFNMSACVV_FLOAT(vres7, va2, vb3, vl); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl); + + vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); + vres5 = VFNMSACVV_FLOAT(vres5, va1, vb2, vl); + vres4 = VFMACCVV_FLOAT(vres4, va1, vb3, vl); + vres5 = VFMACCVV_FLOAT(vres5, va0, vb3, vl); + + vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl); + vres7 = VFNMSACVV_FLOAT(vres7, va3, vb2, vl); + vres6 = VFMACCVV_FLOAT(vres6, va3, vb3, vl); + vres7 = VFMACCVV_FLOAT(vres7, va2, vb3, vl); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl); + + vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); + vres5 = VFNMSACVV_FLOAT(vres5, va1, vb2, vl); + vres4 = VFMACCVV_FLOAT(vres4, va1, vb3, vl); + vres5 = VFNMSACVV_FLOAT(vres5, va0, vb3, vl); + + vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl); + vres7 = VFNMSACVV_FLOAT(vres7, va3, vb2, vl); + vres6 = VFMACCVV_FLOAT(vres6, va3, vb3, vl); + vres7 = VFNMSACVV_FLOAT(vres7, va2, vb3, vl); + +#endif + ptrba += vl * 4; + ptrbb += vl * 4; + } + + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C0[0] = res0 * alphar - res1 * alphai; + C0[1] = res1 * alphar + res0 * alphai; + + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres2, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres3, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C0[2] = res0 * alphar - res1 * alphai; + C0[3] = res1 * alphar + res0 * alphai; + + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres4, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, 
vres5, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C1[0] = res0 * alphar - res1 * alphai; + C1[1] = res1 * alphar + res0 * alphai; + + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres6, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres7, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C1[2] = res0 * alphar - res1 * alphai; + C1[3] = res1 * alphar + res0 * alphai; +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 2; +#else + temp -= 2; +#endif + + ptrba += temp*2*2; + ptrbb += temp*2*2; + +#endif + +#ifdef LEFT + off += 2; +#endif + + C0 = C0+4; + C1 = C1+4; + } + + if (bm & 1) + { +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*2; + ptrbb = bb + off*2*2; +#endif + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk - off; +#elif defined(LEFT) + temp = off+1; +#else + temp = off+2; +#endif + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl); + vres3 = VFMACCVV_FLOAT(vres3, va1, vb2, vl); + vres2 = VFNMSACVV_FLOAT(vres2, va1, vb3, vl); + vres3 = VFMACCVV_FLOAT(vres3, va0, vb3, vl); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl); + vres3 = VFMACCVV_FLOAT(vres3, va1, vb2, vl); + vres2 = VFMACCVV_FLOAT(vres2, va1, vb3, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va0, vb3, vl); + +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va1, vb2, vl); + vres2 = VFMACCVV_FLOAT(vres2, va1, vb3, vl); + vres3 = VFMACCVV_FLOAT(vres3, va0, vb3, vl); + +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va1, vb2, vl); + vres2 = VFNMSACVV_FLOAT(vres2, va1, vb3, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va0, vb3, vl); + +#endif + ptrba += vl * 2; + ptrbb += vl * 4; + } + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C0[0] = res0 * alphar - res1 * alphai; + C0[1] = res1 * alphar + res0 * alphai; + + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, 
vres2, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres3, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C1[0] = res0 * alphar - res1 * alphai; + C1[1] = res1 * alphar + res0 * alphai; + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 1; +#else + temp -= 2; +#endif + ptrba += temp*2; + ptrbb += temp*2*2; +#endif +#ifdef LEFT + off += 1; +#endif + C0 = C0+2; + C1 = C1+2; + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; +#endif + k = (bk<<2); + bb = bb+k; + i = (ldc<<2); + C = C+i; + } + + if (bn & 1) + { + C0 = C; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + ptrba = ba; + + for (i = bm/2; i > 0; i--) + { +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*2*2; + ptrbb = bb+off*2; +#endif + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk - off; +#elif defined(LEFT) + temp = off + 2; +#else + temp = off + 1; +#endif + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); + VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFNMSACVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl); + +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFNMSACVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl); + +#endif + ptrba += vl * 4; + ptrbb += vl * 2; + } + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C0[0] = res0 * alphar - res1 * alphai; + C0[1] = res1 * alphar + res0 * alphai; + + v_m1_res0 = 
VFREDSUMVS_FLOAT(v_m1_res0, vres2, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres3, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C0[2] = res0 * alphar - res1 * alphai; + C0[3] = res1 * alphar + res0 * alphai; + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + temp = bk-off; +#ifdef LEFT + temp -= 2; +#else + temp -= 1; +#endif + ptrba += temp*2*2; + ptrbb += temp*2; +#endif +#ifdef LEFT + off += 2; +#endif + C0 = C0+4; + } + + if (bm & 1) + { +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*2; + ptrbb = bb + off*2; +#endif + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off + 1; +#else + temp = off + 1; +#endif + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + +#endif + ptrba += vl * 2; + ptrbb += vl * 2; + + } + + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + + C0[0] = res0 * alphar - res1 * alphai; + C0[1] = res1 * alphar + res0 * alphai; + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 1; +#else + temp -= 1; +#endif + ptrba += temp*2; + ptrbb += temp*2; + +#endif +#ifdef LEFT + off += 1; +#endif + C0 = C0+2; + } + k = (bk<<1); + bb = bb+k; + i = (ldc<<1); + C = C+i; + } + return 0; +} diff --git a/param.h b/param.h index 514b13a3a..62b675d6c 100644 --- a/param.h +++ b/param.h @@ -3038,6 +3038,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#if defined(x280) +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 8 + +/* SGEMM_UNROLL_MN is normally computed as max(SGEMM_UNROLL_M, SGEMM_UNROLL_N). + * Because SGEMM_UNROLL_M is not a true compile-time constant for these kernels, + * the macro has to be set manually here. + * If VLMAX ever exceeds 1024, this value should be increased as well.
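 * (KERNEL.x280 notes that UNROLL_M for the vector kernels is VLMAX, so the
 * UNROLL_M value defined above is only nominal; hence this manual setting.)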
*/ +#define SGEMM_DEFAULT_UNROLL_MN 32 + +#define DGEMM_DEFAULT_UNROLL_M 16 +#define DGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_MN 32 + +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 160 +#define DGEMM_DEFAULT_P 160 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 120 + +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + +#define SYMV_P 16 + +#endif #ifdef C910V #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 From 5d0d1c555195a391fe5d029427dfbf7b942ecdf9 Mon Sep 17 00:00:00 2001 From: Heller Zheng Date: Tue, 15 Nov 2022 18:22:21 -0800 Subject: [PATCH 2/5] Remove redundant files --- Makefile.install | 5 - kernel/riscv64/KERNEL.x280 | 36 +- kernel/riscv64/gemm_ncopy_2_rvv.c | 92 --- kernel/riscv64/gemm_ncopy_4_rvv.c | 123 ---- kernel/riscv64/gemm_tcopy_2_rvv.c | 108 ---- kernel/riscv64/gemm_tcopy_4_rvv.c | 236 -------- kernel/riscv64/gemmkernel_2x2_rvv.c | 214 ------- kernel/riscv64/gemmkernel_4x4_rvv.c | 508 ---------------- kernel/riscv64/trmmkernel_2x2_rvv.c | 342 ----------- kernel/riscv64/trmmkernel_4x4_rvv.c | 881 ---------------------------- 10 files changed, 2 insertions(+), 2543 deletions(-) delete mode 100644 kernel/riscv64/gemm_ncopy_2_rvv.c delete mode 100644 kernel/riscv64/gemm_ncopy_4_rvv.c delete mode 100644 kernel/riscv64/gemm_tcopy_2_rvv.c delete mode 100644 kernel/riscv64/gemm_tcopy_4_rvv.c delete mode 100644 kernel/riscv64/gemmkernel_2x2_rvv.c delete mode 100644 kernel/riscv64/gemmkernel_4x4_rvv.c delete mode 100644 kernel/riscv64/trmmkernel_2x2_rvv.c delete mode 100644 kernel/riscv64/trmmkernel_4x4_rvv.c diff --git a/Makefile.install b/Makefile.install index f1adaa271..168d08f72 100644 --- a/Makefile.install +++ b/Makefile.install @@ -8,7 +8,6 @@ PREFIX ?= /opt/OpenBLAS OPENBLAS_INCLUDE_DIR := $(PREFIX)/include OPENBLAS_LIBRARY_DIR := $(PREFIX)/lib OPENBLAS_BINARY_DIR := $(PREFIX)/bin -OPENBLAS_RELEASE_DIR := $(PREFIX)/release OPENBLAS_BUILD_DIR := $(CURDIR) OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/$(LIBSONAMEBASE) OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake @@ -39,7 +38,6 @@ install : lib.grd @-mkdir -p "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)" @-mkdir -p "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @-mkdir -p "$(DESTDIR)$(OPENBLAS_BINARY_DIR)" - @-mkdir -p "$(DESTDIR)$(OPENBLAS_RELEASE_DIR)" @-mkdir -p "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)" @-mkdir -p "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" @echo Generating openblas_config.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) @@ -204,8 +202,5 @@ endif @echo " endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo "endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo Install OK! -#Generating release tar - @echo Generating $(OPENBLAS_RELEASE_DIR)/$(basename $(LIBNAME)).tar.gz - @tar -cvz --file=$(OPENBLAS_RELEASE_DIR)/$(basename $(LIBNAME)).tar.gz --directory=$(PREFIX) --exclude=release .
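(For orientation: the P/Q/R defaults added to param.h above size the packed-panel blocking used by OpenBLAS's level-3 driver. The sketch below is illustrative only, with a hypothetical loop structure rather than the actual code under driver/level3, showing roughly how such parameters partition C = alpha*A*B:

    /* Illustrative blocking sketch: P x Q panels of A and Q x R panels of B
       are packed so the micro-kernel streams over UNROLL_M x UNROLL_N tiles
       of C while the packed panels stay cache-resident. */
    for (BLASLONG js = 0; js < n; js += SGEMM_DEFAULT_R)            /* columns of B/C */
        for (BLASLONG ls = 0; ls < k; ls += SGEMM_DEFAULT_Q)        /* shared K dim   */
            for (BLASLONG is = 0; is < m; is += SGEMM_DEFAULT_P) {  /* rows of A/C    */
                /* pack A(is.., ls..) and B(ls.., js..), then run the GEMM
                   micro-kernel on the packed panels */
            }
)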
diff --git a/kernel/riscv64/KERNEL.x280 b/kernel/riscv64/KERNEL.x280 index 2eb60f2b4..4d64354fb 100644 --- a/kernel/riscv64/KERNEL.x280 +++ b/kernel/riscv64/KERNEL.x280 @@ -122,23 +122,7 @@ CTRMMKERNEL = ztrmmkernel_2x2_rvv.c ZTRMMKERNEL = ztrmmkernel_2x2_rvv.c # SGEMM_UNROLL_N set in params.h -ifeq ($(SGEMM_UNROLL_N), 2) -SGEMMKERNEL = gemmkernel_2x2_rvv.c -SGEMMONCOPY = gemm_ncopy_2_rvv.c -SGEMMOTCOPY = gemm_tcopy_2_rvv.c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o - -STRMMKERNEL = trmmkernel_2x2_rvv.c -else ifeq ($(SGEMM_UNROLL_N), 4) -SGEMMKERNEL = gemmkernel_4x4_rvv.c -SGEMMONCOPY = gemm_ncopy_4_rvv.c -SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o - -STRMMKERNEL = trmmkernel_4x4_rvv.c -else ifeq ($(SGEMM_UNROLL_N), 8) +ifeq ($(SGEMM_UNROLL_N), 8) # UNROLL_M is VLMAX SGEMMKERNEL = gemmkernel_rvv_v1x8.c SGEMMINCOPY = gemm_ncopy_rvv_v1.c @@ -162,23 +146,7 @@ SSYMMLCOPY_M = symm_lcopy_rvv_v1.c endif # SGEMM_UNROLL_N set in params.h -ifeq ($(DGEMM_UNROLL_N), 2) -DGEMMKERNEL = gemmkernel_2x2_rvv.c -DGEMMONCOPY = gemm_ncopy_2_rvv.c -DGEMMOTCOPY = gemm_tcopy_2_rvv.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o - -DTRMMKERNEL = trmmkernel_2x2_rvv.c -else ifeq ($(DGEMM_UNROLL_N), 4) -DGEMMKERNEL = gemmkernel_4x4_rvv.c -DGEMMONCOPY = gemm_ncopy_4_rvv.c -DGEMMOTCOPY = ../generic/gemm_tcopy_4.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o - -DTRMMKERNEL = trmmkernel_4x4_rvv.c -else ifeq ($(DGEMM_UNROLL_N), 8) +ifeq ($(DGEMM_UNROLL_N), 8) # UNROLL_M is VLMAX DGEMMKERNEL = gemmkernel_rvv_v1x8.c DGEMMINCOPY = gemm_ncopy_rvv_v1.c diff --git a/kernel/riscv64/gemm_ncopy_2_rvv.c b/kernel/riscv64/gemm_ncopy_2_rvv.c deleted file mode 100644 index 5f55bc349..000000000 --- a/kernel/riscv64/gemm_ncopy_2_rvv.c +++ /dev/null @@ -1,92 +0,0 @@ -/*************************************************************************** -Copyright (c) 2022, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#include "common.h" - -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle32_v_f32m4 -#define VSEV_FLOAT vse32_v_f32m4 -#define VSSEG2_FLOAT vsseg2e32_v_f32m4 -#else -#define VSETVL(n) vsetvl_e64m4(n) -#define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle64_v_f64m4 -#define VSEV_FLOAT vse64_v_f64m4 -#define VSSEG2_FLOAT vsseg2e64_v_f64m4 -#endif - -// Optimizes the implementation in ../generic/gemm_ncopy_2.c - -int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) -{ - BLASLONG i, j; - IFLOAT *a_offset, *a_offset1, *a_offset2; - IFLOAT *b_offset; - FLOAT_V_T v1, v2; - size_t vl; - - //fprintf(stderr, "gemm_ncopy_2 m=%ld n=%ld lda=%ld\n", m, n, lda); // KU - - a_offset = a; - b_offset = b; - - for(j = (n >> 1); j > 0; j--) { - - a_offset1 = a_offset; - a_offset2 = a_offset + lda; - a_offset += 2 * lda; - - for(i = m; i > 0; i -= vl) { - vl = VSETVL(i); - - v1 = VLEV_FLOAT(a_offset1, vl); - v2 = VLEV_FLOAT(a_offset2, vl); - VSSEG2_FLOAT(b_offset, v1, v2, vl); - - a_offset1 += vl; - a_offset2 += vl; - b_offset += vl*2; - } - } - - if (n & 1) { - - for(i = m; i > 0; i -= vl) { - vl = VSETVL(i); - - v1 = VLEV_FLOAT(a_offset, vl); - VSEV_FLOAT(b_offset, v1, vl); - - a_offset += vl; - b_offset += vl; - } - } - - return 0; -} diff --git a/kernel/riscv64/gemm_ncopy_4_rvv.c b/kernel/riscv64/gemm_ncopy_4_rvv.c deleted file mode 100644 index 4d4efe4c9..000000000 --- a/kernel/riscv64/gemm_ncopy_4_rvv.c +++ /dev/null @@ -1,123 +0,0 @@ -/*************************************************************************** -Copyright (c) 2022, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#include "common.h" - -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define VSSEG4_FLOAT vsseg4e32_v_f32m2 -#else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define VSSEG4_FLOAT vsseg4e64_v_f64m2 -#endif - -// Optimizes the implementation in ../generic/gemm_ncopy_4.c - -int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) -{ - BLASLONG i, j; - - FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; - FLOAT *b_offset; - - FLOAT_V_T v1, v2, v3, v4; - size_t vl; - - //fprintf(stderr, "gemm_ncopy_4 m=%ld n=%ld lda=%ld\n", m, n, lda); - - a_offset = a; - b_offset = b; - - for(j = (n >> 2); j > 0; j--) { - a_offset1 = a_offset; - a_offset2 = a_offset1 + lda; - a_offset3 = a_offset2 + lda; - a_offset4 = a_offset3 + lda; - a_offset += 4 * lda; - - for(i = m; i > 0; i -= vl) { - vl = VSETVL(i); - - v1 = VLEV_FLOAT(a_offset1, vl); - v2 = VLEV_FLOAT(a_offset2, vl); - v3 = VLEV_FLOAT(a_offset3, vl); - v4 = VLEV_FLOAT(a_offset4, vl); - - VSSEG4_FLOAT(b_offset, v1, v2, v3, v4, vl); - - a_offset1 += vl; - a_offset2 += vl; - a_offset3 += vl; - a_offset4 += vl; - b_offset += vl*4; - } - } - - if (n & 2) { - a_offset1 = a_offset; - a_offset2 = a_offset1 + lda; - a_offset += 2 * lda; - - for(i = m; i > 0; i -= vl) { - vl = VSETVL(i); - - v1 = VLEV_FLOAT(a_offset1, vl); - v2 = VLEV_FLOAT(a_offset2, vl); - - VSSEG2_FLOAT(b_offset, v1, v2, vl); - - a_offset1 += vl; - a_offset2 += vl; - b_offset += vl*2; - } - } - - if (n & 1) { - a_offset1 = a_offset; - - for(i = m; i > 0; i -= vl) { - vl = VSETVL(i); - - v1 = VLEV_FLOAT(a_offset1, vl); - - VSEV_FLOAT(b_offset, v1, vl); - - a_offset1 += vl; - b_offset += vl; - } - } - - return 0; -} diff --git a/kernel/riscv64/gemm_tcopy_2_rvv.c b/kernel/riscv64/gemm_tcopy_2_rvv.c deleted file mode 100644 index 963e1be69..000000000 --- a/kernel/riscv64/gemm_tcopy_2_rvv.c +++ /dev/null @@ -1,108 +0,0 @@ -/*************************************************************************** -Copyright (c) 2022, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VSSSEG2_FLOAT vssseg2e32_v_f32m2 -#define VSSSEG4_FLOAT vssseg4e32_v_f32m2 -#else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VSSSEG2_FLOAT vssseg2e64_v_f64m2 -#define VSSSEG4_FLOAT vssseg4e64_v_f64m2 -#endif - -// Optimizes the implementation in ../generic/gemm_tcopy_2.c - -int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) -{ - BLASLONG i, j; - IFLOAT *a_offset, *a_offset1, *a_offset2; - IFLOAT *b_offset, *b_offset1, *b_offset2; - FLOAT_V_T v1a, v1b, v2a, v2b; - size_t vl; - - //fprintf(stderr, "gemm_tcopy_2 m=%ld n=%ld lda=%ld\n", m, n, lda); // KU - - a_offset = a; - b_offset = b; - b_offset2 = b + m * (n & ~1); - - for(i = (m >> 1); i > 0; i--) { - - a_offset1 = a_offset; - a_offset2 = a_offset + lda; - a_offset += 2 * lda; - - b_offset1 = b_offset; - b_offset += 4; - - for(j = (n >> 1); j > 0; j -= vl) { - vl = VSETVL(j); - - VLSEG2_FLOAT(&v1a, &v1b, a_offset1, vl); - VLSEG2_FLOAT(&v2a, &v2b, a_offset2, vl); - - VSSSEG4_FLOAT(b_offset1, m*2*sizeof(FLOAT), v1a, v1b, v2a, v2b, vl); - - a_offset1 += vl * 2; - a_offset2 += vl * 2; - b_offset1 += vl * m * 2; - } - - if (n & 1) { - *(b_offset2 + 0) = *(a_offset1 + 0); - *(b_offset2 + 1) = *(a_offset2 + 0); - b_offset2 += 2; - } - } - - if (m & 1) { - - for(j = (n >> 1); j > 0; j -= vl) { - vl = VSETVL(j); - - VLSEG2_FLOAT(&v1a, &v1b, a_offset, vl); - - VSSSEG2_FLOAT(b_offset, m*2*sizeof(FLOAT), v1a, v1b, vl); - - a_offset += vl * 2; - b_offset += vl * m * 2; - } - - if (n & 1){ - *(b_offset2 + 0) = *(a_offset + 0); - } - } - - return 0; -} diff --git a/kernel/riscv64/gemm_tcopy_4_rvv.c b/kernel/riscv64/gemm_tcopy_4_rvv.c deleted file mode 100644 index ac9974b24..000000000 --- a/kernel/riscv64/gemm_tcopy_4_rvv.c +++ /dev/null @@ -1,236 +0,0 @@ -/*************************************************************************** -Copyright (c) 2022, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. 
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VSSSEG2_FLOAT vssseg2e32_v_f32m2 -#define VSSSEG4_FLOAT vssseg4e32_v_f32m2 -#else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VSSSEG2_FLOAT vssseg2e64_v_f64m2 -#define VSSSEG4_FLOAT vssseg4e64_v_f64m2 -#endif - -// Optimizes the implementation in ../generic/gemm_tcopy_4.c - -int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) -{ - BLASLONG i, j; - - FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; - FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3; - FLOAT ctemp1, ctemp2, ctemp3, ctemp4; - FLOAT ctemp5, ctemp6, ctemp7, ctemp8; - FLOAT ctemp9, ctemp10, ctemp11, ctemp12; - FLOAT ctemp13, ctemp14, ctemp15, ctemp16; - - //fprintf(stderr, "gemm_tcopy_4 m=%ld n=%ld lda=%ld\n", m, n, lda); - - a_offset = a; - b_offset = b; - - b_offset2 = b + m * (n & ~3); - b_offset3 = b + m * (n & ~1); - - for(j = (m >> 2); j > 0; j--) { - a_offset1 = a_offset; - a_offset2 = a_offset1 + lda; - a_offset3 = a_offset2 + lda; - a_offset4 = a_offset3 + lda; - a_offset += 4 * lda; - - b_offset1 = b_offset; - b_offset += 16; - - for(i = (n >> 2); i > 0; i--) { - v1 = VLEV_FLOAT(a_offset1, 4); - v2 = VLEV_FLOAT(a_offset2, 4); - v3 = VLEV_FLOAT(a_offset3, 4); - v4 = VLEV_FLOAT(a_offset4, 4); - - a_offset1 += 4; - a_offset2 += 4; - a_offset3 += 4; - a_offset4 += 4; - - VSEV_FLOAT(b_offset1, v1, 4); - VSEV_FLOAT(b_offset2+4, v2, 4); - VSEV_FLOAT(b_offset2+8, v3, 4); - VSEV_FLOAT(b_offset2+12, v4, 4); - - b_offset1 += m * 4; - } - - if (n & 2) { - v1 = VLEV_FLOAT(a_offset1, 2); - v2 = VLEV_FLOAT(a_offset2, 2); - v3 = VLEV_FLOAT(a_offset3, 2); - v4 = VLEV_FLOAT(a_offset4, 2); - - a_offset1 += 2; - a_offset2 += 2; - a_offset3 += 2; - a_offset4 += 2; - - VSEV_FLOAT(b_offset2, v1, 2); - VSEV_FLOAT(b_offset2+2, v2, 2); - VSEV_FLOAT(b_offset2+4, v3, 2); - VSEV_FLOAT(b_offset2+6, v4, 2); - - b_offset2 += 8; - } - - if (n & 1) { - v1 = VLEV_FLOAT(a_offset1, 1); - v2 = VLEV_FLOAT(a_offset2, 1); - v3 = VLEV_FLOAT(a_offset3, 1); - v4 = VLEV_FLOAT(a_offset4, 1); - - VSSEG4_FLOAT(b_offset3, v1, v2, v3, v4, 1); - - b_offset3 += 4; - } - - } - -// TODO cleanup - - if (m & 2){ - a_offset1 = a_offset; - a_offset2 = a_offset1 + lda; - a_offset += 2 * lda; - - b_offset1 = b_offset; - b_offset += 8; - - i = (n >> 2); - if (i > 0){ - do{ - ctemp1 = *(a_offset1 + 0); - ctemp2 = *(a_offset1 + 1); - ctemp3 = *(a_offset1 + 2); - ctemp4 = *(a_offset1 + 3); - - ctemp5 = *(a_offset2 + 0); - ctemp6 = *(a_offset2 + 1); - ctemp7 = *(a_offset2 + 2); - ctemp8 = 
*(a_offset2 + 3); - - a_offset1 += 4; - a_offset2 += 4; - - *(b_offset1 + 0) = ctemp1; - *(b_offset1 + 1) = ctemp2; - *(b_offset1 + 2) = ctemp3; - *(b_offset1 + 3) = ctemp4; - - *(b_offset1 + 4) = ctemp5; - *(b_offset1 + 5) = ctemp6; - *(b_offset1 + 6) = ctemp7; - *(b_offset1 + 7) = ctemp8; - - b_offset1 += m * 4; - i --; - }while(i > 0); - } - - if (n & 2) { - ctemp1 = *(a_offset1 + 0); - ctemp2 = *(a_offset1 + 1); - - ctemp3 = *(a_offset2 + 0); - ctemp4 = *(a_offset2 + 1); - - a_offset1 += 2; - a_offset2 += 2; - - *(b_offset2 + 0) = ctemp1; - *(b_offset2 + 1) = ctemp2; - *(b_offset2 + 2) = ctemp3; - *(b_offset2 + 3) = ctemp4; - - b_offset2 += 4; - } - - if (n & 1) { - ctemp1 = *(a_offset1 + 0); - ctemp2 = *(a_offset2 + 0); - - *(b_offset3 + 0) = ctemp1; - *(b_offset3 + 1) = ctemp2; - b_offset3 += 2; - } - } - - if (m & 1){ - a_offset1 = a_offset; - b_offset1 = b_offset; - - i = (n >> 2); - if (i > 0){ - do{ - ctemp1 = *(a_offset1 + 0); - ctemp2 = *(a_offset1 + 1); - ctemp3 = *(a_offset1 + 2); - ctemp4 = *(a_offset1 + 3); - - a_offset1 += 4; - - *(b_offset1 + 0) = ctemp1; - *(b_offset1 + 1) = ctemp2; - *(b_offset1 + 2) = ctemp3; - *(b_offset1 + 3) = ctemp4; - - b_offset1 += 4 * m; - - i --; - }while(i > 0); - } - - if (n & 2) { - ctemp1 = *(a_offset1 + 0); - ctemp2 = *(a_offset1 + 1); - a_offset1 += 2; - - *(b_offset2 + 0) = ctemp1; - *(b_offset2 + 1) = ctemp2; - } - - if (n & 1) { - ctemp1 = *(a_offset1 + 0); - *(b_offset3 + 0) = ctemp1; - } - } - - return 0; -} diff --git a/kernel/riscv64/gemmkernel_2x2_rvv.c b/kernel/riscv64/gemmkernel_2x2_rvv.c deleted file mode 100644 index ec8961ced..000000000 --- a/kernel/riscv64/gemmkernel_2x2_rvv.c +++ /dev/null @@ -1,214 +0,0 @@ -/*************************************************************************** -Copyright (c) 2022, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#include "common.h" - -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m4() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m4_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m4 -#define VLSEG2_FLOAT vlseg2e32_v_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFMACCVV_FLOAT vfmacc_vv_f32m4 -#define VFREDSUMVS_FLOAT vfredusum_vs_f32m4_f32m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 -#else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m4() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m4_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m4 -#define VLSEG2_FLOAT vlseg2e64_v_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFMACCVV_FLOAT vfmacc_vv_f64m4 -#define VFREDSUMVS_FLOAT vfredusum_vs_f64m4_f64m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 -#endif - -// Optimizes the implementation in ../generic/gemm_kernel_2x2.c - -int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, IFLOAT* ba, IFLOAT* bb, FLOAT* C, BLASLONG ldc -#ifdef TRMMKERNEL - ,BLASLONG offset -#endif - ) -{ - BLASLONG i,j,k; - FLOAT *C0,*C1; - IFLOAT *ptrba,*ptrbb; - - //fprintf(stderr, "gemm_kernel_2x2 bm=%ld bn=%ld bk=%ld alpha=%f ldc=%ld\n", bm, bn, bk, alpha, ldc); - - FLOAT_V_T va0, va1, vb0, vb1; - FLOAT_V_T vres0, vres1, vres2, vres3; - FLOAT_V_T_M1 vsum0, vsum1, vsum2, vsum3; - FLOAT_V_T_M1 v_z0; - - v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); - size_t vlmax = VSETVL_MAX; - size_t vl; - - for (j = bn/2; j > 0; j--) { - C0 = C; - C1 = C0 + ldc; - ptrba = ba; - - for (i = bm/2; i > 0; i--) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - vres2 = VFMVVF_FLOAT(0.0, vlmax); - vres3 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); - VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); - vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); - vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); - - ptrba += vl*2; - ptrbb += vl*2; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - C1[0] += alpha * VFMVFS_FLOAT_M1(vsum2); - C1[1] += alpha * VFMVFS_FLOAT_M1(vsum3); - - C0 += 2; - C1 += 2; - } - - if(bm & 1) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - va0 = VLEV_FLOAT(ptrba, vl); - VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); - - ptrba += vl; - ptrbb += vl*2; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C1[0] += alpha * VFMVFS_FLOAT_M1(vsum1); - - C0 += 1; - C1 += 1; - } - - bb += (bk<<1); - C += (ldc<<1); - } - - if(bn & 1) { - C0 = C; - ptrba = ba; - for (i = bm/2; i > 0; i--) { - ptrbb = bb; - - 
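- /* Same pattern as the 2x2 block above: each vres register accumulates
-    per-lane partial products across the whole k loop, and the horizontal
-    reduction is deferred to a single vfredusum after the loop. */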
vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); - vb0 = VLEV_FLOAT(ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); - - ptrba += vl*2; - ptrbb += vl; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - - C0 += 2; - } - - if(bm & 1) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - va0 = VLEV_FLOAT(ptrba, vl); - vb0 = VLEV_FLOAT(ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - - ptrba += vl; - ptrbb += vl; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - - C0 += 1; - } - - bb += (bk<<0); - C += ldc; - } - - return 0; -} diff --git a/kernel/riscv64/gemmkernel_4x4_rvv.c b/kernel/riscv64/gemmkernel_4x4_rvv.c deleted file mode 100644 index aa58bcc76..000000000 --- a/kernel/riscv64/gemmkernel_4x4_rvv.c +++ /dev/null @@ -1,508 +0,0 @@ -/*************************************************************************** -Copyright (c) 2022, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#include "common.h" - -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m1(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m1_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m1 -#define VLSEG2_FLOAT vlseg2e32_v_f32m1 -#define VLSEG4_FLOAT vlseg4e32_v_f32m1 -#define VFMVVF_FLOAT vfmv_v_f_f32m1 -#define VFMACCVF_FLOAT vfmacc_vf_f32m1 -#define VFMACCVV_FLOAT vfmacc_vv_f32m1 -#define VFREDSUMVS_FLOAT vfredusum_vs_f32m1_f32m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 -#else -#define VSETVL(n) vsetvl_e64m1(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m1_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m1 -#define VLSEG2_FLOAT vlseg2e64_v_f64m1 -#define VLSEG4_FLOAT vlseg4e64_v_f64m1 -#define VFMVVF_FLOAT vfmv_v_f_f64m1 -#define VFMACCVF_FLOAT vfmacc_vf_f64m1 -#define VFMACCVV_FLOAT vfmacc_vv_f64m1 -#define VFREDSUMVS_FLOAT vfredusum_vs_f64m1_f64m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 -#endif - -// Optimizes the implementation in ../generic/gemm_kernel_2x2.c - -int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, IFLOAT* ba, IFLOAT* bb, FLOAT* C, BLASLONG ldc -#ifdef TRMMKERNEL - ,BLASLONG offset -#endif - ) -{ - BLASLONG i,j,k; - FLOAT *C0,*C1,*C2,*C3; - IFLOAT *ptrba,*ptrbb; - - //fprintf(stderr, "gemm_kernel_4x4 bm=%ld bn=%ld bk=%ld alpha=%f ldc=%ld\n", bm, bn, bk, alpha, ldc); // KU - - FLOAT_V_T va0, va1, va2, va3; - FLOAT_V_T vb0, vb1, vb2, vb3; - FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; - FLOAT_V_T vres8, vres9, vres10, vres11, vres12, vres13, vres14, vres15; - FLOAT_V_T_M1 vsum0, vsum1, vsum2, vsum3; - FLOAT_V_T_M1 v_z0; - - v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); - size_t vlmax = VSETVL_MAX; - size_t vl; - - for (j = bn/4; j > 0; j--) { - C0 = C; - C1 = C0 + ldc; - C2 = C1 + ldc; - C3 = C2 + ldc; - ptrba = ba; - - for (i = bm/4; i > 0; i--) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - vres2 = VFMVVF_FLOAT(0.0, vlmax); - vres3 = VFMVVF_FLOAT(0.0, vlmax); - vres4 = VFMVVF_FLOAT(0.0, vlmax); - vres5 = VFMVVF_FLOAT(0.0, vlmax); - vres6 = VFMVVF_FLOAT(0.0, vlmax); - vres7 = VFMVVF_FLOAT(0.0, vlmax); - vres8 = VFMVVF_FLOAT(0.0, vlmax); - vres9 = VFMVVF_FLOAT(0.0, vlmax); - vres10 = VFMVVF_FLOAT(0.0, vlmax); - vres11 = VFMVVF_FLOAT(0.0, vlmax); - vres12 = VFMVVF_FLOAT(0.0, vlmax); - vres13 = VFMVVF_FLOAT(0.0, vlmax); - vres14 = VFMVVF_FLOAT(0.0, vlmax); - vres15 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); - VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); - vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); - vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); - - vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); - vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl); - vres6 = VFMACCVV_FLOAT(vres6, va0, vb3, vl); - vres7 = VFMACCVV_FLOAT(vres7, va1, vb3, vl); - - vres8 = VFMACCVV_FLOAT(vres8, va2, vb0, vl); - vres9 = VFMACCVV_FLOAT(vres9, va3, vb0, vl); - vres10 = VFMACCVV_FLOAT(vres10, va2, vb1, vl); - vres11 = VFMACCVV_FLOAT(vres11, va3, vb1, vl); - - vres12 = VFMACCVV_FLOAT(vres12, va2, vb2, vl); - vres13 = 
VFMACCVV_FLOAT(vres13, va3, vb2, vl); - vres14 = VFMACCVV_FLOAT(vres14, va2, vb3, vl); - vres15 = VFMACCVV_FLOAT(vres15, va3, vb3, vl); - - ptrba += vl*4; - ptrbb += vl*4; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres8, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres9, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - C0[2] += alpha * VFMVFS_FLOAT_M1(vsum2); - C0[3] += alpha * VFMVFS_FLOAT_M1(vsum3); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres3, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres10, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres11, v_z0, vlmax); - C1[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C1[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - C1[2] += alpha * VFMVFS_FLOAT_M1(vsum2); - C1[3] += alpha * VFMVFS_FLOAT_M1(vsum3); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres4, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres5, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres12, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres13, v_z0, vlmax); - C2[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C2[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - C2[2] += alpha * VFMVFS_FLOAT_M1(vsum2); - C2[3] += alpha * VFMVFS_FLOAT_M1(vsum3); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres6, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres7, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres14, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres15, v_z0, vlmax); - C3[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C3[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - C3[2] += alpha * VFMVFS_FLOAT_M1(vsum2); - C3[3] += alpha * VFMVFS_FLOAT_M1(vsum3); - - C0 += 4; - C1 += 4; - C2 += 4; - C3 += 4; - } - - if(bm & 2) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - vres2 = VFMVVF_FLOAT(0.0, vlmax); - vres3 = VFMVVF_FLOAT(0.0, vlmax); - vres4 = VFMVVF_FLOAT(0.0, vlmax); - vres5 = VFMVVF_FLOAT(0.0, vlmax); - vres6 = VFMVVF_FLOAT(0.0, vlmax); - vres7 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); - VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); - vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); - vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); - - vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); - vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl); - vres6 = VFMACCVV_FLOAT(vres6, va0, vb3, vl); - vres7 = VFMACCVV_FLOAT(vres7, va1, vb3, vl); - - ptrba += vl*2; - ptrbb += vl*4; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres3, v_z0, vlmax); - C1[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C1[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres4, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres5, v_z0, vlmax); - C2[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C2[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres6, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres7, v_z0, vlmax); - C3[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C3[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - - C0 += 2; - C1 += 2; - C2 += 2; - C3 += 2; - } - - if(bm & 1) { - 
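- /* bm & 1 tail: the last row of A is loaded with the unit-stride VLEV_FLOAT,
-    while B still supplies four packed columns per k step via VLSEG4_FLOAT. */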
ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - vres2 = VFMVVF_FLOAT(0.0, vlmax); - vres3 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - va0 = VLEV_FLOAT(ptrba, vl); - VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); - vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl); - vres3 = VFMACCVV_FLOAT(vres3, va0, vb3, vl); - - ptrba += vl; - ptrbb += vl*4; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C1[0] += alpha * VFMVFS_FLOAT_M1(vsum1); - C2[0] += alpha * VFMVFS_FLOAT_M1(vsum2); - C3[0] += alpha * VFMVFS_FLOAT_M1(vsum3); - - C0 += 1; - C1 += 1; - C2 += 1; - C3 += 1; - } - - bb += (bk<<2); - C += (ldc<<2); - } - - if(bn & 2) { - - C0 = C; - C1 = C0 + ldc; - ptrba = ba; - - for (i = bm/4; i > 0; i--) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - vres2 = VFMVVF_FLOAT(0.0, vlmax); - vres3 = VFMVVF_FLOAT(0.0, vlmax); - - vres4 = VFMVVF_FLOAT(0.0, vlmax); - vres5 = VFMVVF_FLOAT(0.0, vlmax); - vres6 = VFMVVF_FLOAT(0.0, vlmax); - vres7 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); - VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); - vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); - vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); - - vres4 = VFMACCVV_FLOAT(vres4, va0, vb1, vl); - vres5 = VFMACCVV_FLOAT(vres5, va1, vb1, vl); - vres6 = VFMACCVV_FLOAT(vres6, va2, vb1, vl); - vres7 = VFMACCVV_FLOAT(vres7, va3, vb1, vl); - - ptrba += vl*4; - ptrbb += vl*2; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - C0[2] += alpha * VFMVFS_FLOAT_M1(vsum2); - C0[3] += alpha * VFMVFS_FLOAT_M1(vsum3); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres4, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres5, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres6, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres7, v_z0, vlmax); - C1[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C1[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - C1[2] += alpha * VFMVFS_FLOAT_M1(vsum2); - C1[3] += alpha * VFMVFS_FLOAT_M1(vsum3); - - C0 += 4; - C1 += 4; - } - - if(bm & 2) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - vres2 = VFMVVF_FLOAT(0.0, vlmax); - vres3 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); - VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); - vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); - vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); - - ptrba += vl*2; - ptrbb += vl*2; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); - 
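- /* vfredusum leaves each dot product in element 0 of an m1 register;
-    VFMVFS_FLOAT_M1 then moves it out so alpha is applied on the scalar side. */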
C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - C1[0] += alpha * VFMVFS_FLOAT_M1(vsum2); - C1[1] += alpha * VFMVFS_FLOAT_M1(vsum3); - - C0 += 2; - C1 += 2; - } - - if(bm & 1) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - va0 = VLEV_FLOAT(ptrba, vl); - VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); - - ptrba += vl; - ptrbb += vl*2; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C1[0] += alpha * VFMVFS_FLOAT_M1(vsum1); - - C0 += 1; - C1 += 1; - } - - bb += (bk<<1); - C += (ldc<<1); - } - - if(bn & 1) { - C0 = C; - ptrba = ba; - for (i = bm/4; i > 0; i--) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - vres2 = VFMVVF_FLOAT(0.0, vlmax); - vres3 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); - vb0 = VLEV_FLOAT(ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); - vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); - vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); - - ptrba += vl*4; - ptrbb += vl; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - C0[2] += alpha * VFMVFS_FLOAT_M1(vsum2); - C0[3] += alpha * VFMVFS_FLOAT_M1(vsum3); - - C0 += 4; - } - - if(bm & 2) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); - vb0 = VLEV_FLOAT(ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); - - ptrba += vl*2; - ptrbb += vl; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - - C0 += 2; - } - - if(bm & 1) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - va0 = VLEV_FLOAT(ptrba, vl); - vb0 = VLEV_FLOAT(ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - - ptrba += vl; - ptrbb += vl; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - - C0 += 1; - } - - bb += (bk<<0); - C += ldc; - } - - return 0; -} diff --git a/kernel/riscv64/trmmkernel_2x2_rvv.c b/kernel/riscv64/trmmkernel_2x2_rvv.c deleted file mode 100644 index 127e76970..000000000 --- a/kernel/riscv64/trmmkernel_2x2_rvv.c +++ /dev/null @@ -1,342 +0,0 @@ -/*************************************************************************** -Copyright (c) 2022, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. 
Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m4() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m4_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m4 -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFMACCVV_FLOAT vfmacc_vv_f32m4 -#define VFREDSUMVS_FLOAT vfredusum_vs_f32m4_f32m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 -#else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m4() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m4_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m4 -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFMACCVV_FLOAT vfmacc_vv_f64m4 -#define VFREDSUMVS_FLOAT vfredusum_vs_f64m4_f64m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 -#endif - - -// Optimizes the implementation in ../generic/trmmkernel_2x2.c - - -int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc -#ifdef TRMMKERNEL - ,BLASLONG offset -#endif - ) -{ - BLASLONG i,j,k; - FLOAT *C0,*C1,*ptrba,*ptrbb; - BLASLONG off, temp; - - FLOAT_V_T va0, va1, vb0, vb1; - FLOAT_V_T vres0, vres1, vres2, vres3; - FLOAT_V_T_M1 v_res, v_z0; - v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); - size_t vl; - size_t vlmax = VSETVL_MAX; - -#if defined(TRMMKERNEL) && !defined(LEFT) - off = -offset; -#else - off = 0; -#endif - - for (j = bn/2; j > 0; j--) - { - C0 = C; - C1 = C0+ldc; -#if defined(TRMMKERNEL) && defined(LEFT) - off = offset; -#endif - ptrba = ba; - - for (i = bm/2; i > 0; i--) - { -#if (defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*2; - ptrbb = bb + off*2; -#endif - -#if (defined(LEFT) && !defined(TRANSA)) || \ - (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+2; -#else - temp = off+2; -#endif - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - vres2 = VFMVVF_FLOAT(0.0, vlmax); - vres3 = VFMVVF_FLOAT(0.0, vlmax); - for (k = temp; k 
> 0; k -= vl) - { - vl = VSETVL(k); - VLSEG_FLOAT(&va0, &va1, ptrba, vl); - VLSEG_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); - vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); - vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); - - ptrba += vl * 2; - ptrbb += vl * 2; - } - v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUMVS_FLOAT(v_res, vres1, v_z0, vlmax); - C0[1] = alpha * VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUMVS_FLOAT(v_res, vres2, v_z0, vlmax); - C1[0] = alpha * VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUMVS_FLOAT(v_res, vres3, v_z0, vlmax); - C1[1] = alpha * VFMVFS_FLOAT_M1(v_res); - -#if ( defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 2; -#else - temp -= 2; -#endif - ptrba += temp*2; - ptrbb += temp*2; -#endif -#ifdef LEFT - off += 2; -#endif - C0 = C0+2; - C1 = C1+2; - } - - if (bm & 1) - { -#if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off; - ptrbb = bb+off*2; -#endif - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+1; -#else - temp = off+2; -#endif - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = temp; k > 0; k -= vl) - { - vl = VSETVL(k); - va0 = VLEV_FLOAT(ptrba, vl); - VLSEG_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); - - ptrba += vl; - ptrbb += vl * 2; - - } - v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUMVS_FLOAT(v_res, vres1, v_z0, vlmax); - C1[0] = alpha * VFMVFS_FLOAT_M1(v_res); - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk-off; -#ifdef LEFT - temp -= 1; -#else - temp -= 2; -#endif - ptrba += temp; - ptrbb += temp*2; -#endif -#ifdef LEFT - off += 1; -#endif - C0 = C0+1; - C1 = C1+1; - } -#if defined(TRMMKERNEL) && !defined(LEFT) - off += 2; -#endif - k = (bk<<1); - bb = bb+k; - i = (ldc<<1); - C = C+i; - } - - if (bn & 1) - { - C0 = C; -#if defined(TRMMKERNEL) && defined(LEFT) - off = offset; -#endif - ptrba = ba; - - for (i = bm/2; i > 0; i--) - { -#if (defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*2; - ptrbb = bb + off; -#endif - - -#if (defined(LEFT) && !defined(TRANSA)) || \ - (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+2; -#else - temp = off+1; -#endif - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = temp; k > 0; k -= vl) - { - vl = VSETVL(k); - vb0 = VLEV_FLOAT(ptrbb, vl); - VLSEG_FLOAT(&va0, &va1, ptrba, vl); - - vres0 = VFMACCVV_FLOAT(vres0, vb0, va0, vl); - vres1 = VFMACCVV_FLOAT(vres1, vb0, va1, vl); - - ptrba += vl * 2; - ptrbb += vl; - - } - v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUMVS_FLOAT(v_res, vres1, v_z0, vlmax); - C0[1] = alpha * VFMVFS_FLOAT_M1(v_res); - -#if ( defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 2; -#else - temp -= 1; -#endif - ptrba += temp*2; - ptrbb += temp; -#endif -#ifdef LEFT - off += 2; -#endif - - C0 = C0+2; - } - - if (bm & 1) - { -#if (defined(LEFT) && 
defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off; - ptrbb = bb+off; -#endif - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off + 1; -#else - temp = off + 1; -#endif - vres0 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = temp; k > 0; k -= vl) - { - vl = VSETVL(k); - va0 = VLEV_FLOAT(ptrba, vl); - vb0 = VLEV_FLOAT(ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, vb0, va0, vl); - ptrba += vl; - ptrbb += vl; - } - v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(v_res); - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk-off; -#ifdef LEFT - temp -= 1; -#else - temp -= 1; -#endif - ptrba += temp; - ptrbb += temp; -#endif -#ifdef LEFT - off += 1; -#endif - C0 = C0+1; - } -#if defined(TRMMKERNEL) && !defined(LEFT) - off += 1; -#endif - k = (bk<<0); - bb = bb+k; - C = C+ldc; - } - return 0; -} - diff --git a/kernel/riscv64/trmmkernel_4x4_rvv.c b/kernel/riscv64/trmmkernel_4x4_rvv.c deleted file mode 100644 index 3e46c6348..000000000 --- a/kernel/riscv64/trmmkernel_4x4_rvv.c +++ /dev/null @@ -1,881 +0,0 @@ -/*************************************************************************** -Copyright (c) 2022, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#include "common.h" -#include <stdbool.h> - -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define VSETVL_MAX vsetvlmax_e32m2() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m2_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VLSEG4_FLOAT vlseg4e32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VFMVVF_FLOAT vfmv_v_f_f32m2 -#define VFMUL_FLOAT vfmul_vv_f32m2 -#define VFMACCVF_FLOAT vfmacc_vf_f32m2 -#define VFMACCVV_FLOAT vfmacc_vv_f32m2 -#define VFREDSUMVS_FLOAT vfredusum_vs_f32m2_f32m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 -#else -#define VSETVL(n) vsetvl_e64m2(n) -#define VSETVL_MAX vsetvlmax_e64m2() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m2_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VLSEG4_FLOAT vlseg4e64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VFMVVF_FLOAT vfmv_v_f_f64m2 -#define VFMUL_FLOAT vfmul_vv_f64m2 -#define VFMACCVF_FLOAT vfmacc_vf_f64m2 -#define VFMACCVV_FLOAT vfmacc_vv_f64m2 -#define VFREDSUMVS_FLOAT vfredusum_vs_f64m2_f64m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 -#endif - - -// Optimizes the implementation in ../generic/trmmkernel_4x4.c - -int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) -{ - - BLASLONG i,j,k; - FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb; - - FLOAT_V_T va0, va1, va2, va3, vb0, vb1, vb2, vb3; - FLOAT_V_T_M1 vsum0, vsum1, vsum2, vsum3, v_z0; - v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); - size_t vl; - size_t vlmax = VSETVL_MAX; - - FLOAT_V_T vres0_0; - FLOAT_V_T vres0_1; - FLOAT_V_T vres0_2; - FLOAT_V_T vres0_3; - - FLOAT_V_T vres1_0; - FLOAT_V_T vres1_1; - FLOAT_V_T vres1_2; - FLOAT_V_T vres1_3; - - FLOAT_V_T vres2_0; - FLOAT_V_T vres2_1; - FLOAT_V_T vres2_2; - FLOAT_V_T vres2_3; - - FLOAT_V_T vres3_0; - FLOAT_V_T vres3_1; - FLOAT_V_T vres3_2; - FLOAT_V_T vres3_3; - - BLASLONG off, temp; - - bool left; - bool transposed; - bool backwards; - -#ifdef LEFT - left = true; -#else - left = false; -#endif - -#ifdef TRANSA - transposed = true; -#else - transposed = false; -#endif - - backwards = left != transposed; - - if (!left) { - off = -offset; - } - - - for (j=0; j<bn/4; j+=1) // do the Mx4 loops - { - C0 = C; - C1 = C0+ldc; - C2 = C1+ldc; - C3 = C2+ldc; - -#if defined(TRMMKERNEL) && defined(LEFT) - off = offset; -#endif - - ptrba = ba; - - for (i=0; i<bm/4; i+=1) - { -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*4; - ptrbb = bb + off*4; -#endif - - vres0_0 = VFMVVF_FLOAT(0, vlmax); - vres0_1 = VFMVVF_FLOAT(0, vlmax); - vres0_2 = VFMVVF_FLOAT(0, vlmax); - vres0_3 = VFMVVF_FLOAT(0, vlmax); - - vres1_0 = VFMVVF_FLOAT(0, vlmax); - vres1_1 = VFMVVF_FLOAT(0, vlmax); - vres1_2 = VFMVVF_FLOAT(0, vlmax); - vres1_3 = VFMVVF_FLOAT(0, vlmax); - - vres2_0 = VFMVVF_FLOAT(0, vlmax); - vres2_1 = VFMVVF_FLOAT(0, vlmax); - vres2_2 = VFMVVF_FLOAT(0, vlmax); - vres2_3 = VFMVVF_FLOAT(0, vlmax); - - vres3_0 = VFMVVF_FLOAT(0, vlmax); - vres3_1 = VFMVVF_FLOAT(0, vlmax); - vres3_2 = VFMVVF_FLOAT(0, vlmax); - vres3_3 = VFMVVF_FLOAT(0, vlmax); - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+4; // number of values in A -#else - temp = off+4; // number of values in B -#endif - - for (k = temp; k > 0; k -= vl) - { - vl = VSETVL(k); - VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); - VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); - - vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); - vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); - vres2_0 = VFMACCVV_FLOAT(vres2_0, va0, vb2, vl); - vres3_0 = VFMACCVV_FLOAT(vres3_0, va0, vb3, vl); - - vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); - vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl); - vres2_1 = VFMACCVV_FLOAT(vres2_1, va1, vb2, vl); - vres3_1 = VFMACCVV_FLOAT(vres3_1, va1, vb3, vl); - - vres0_2 = VFMACCVV_FLOAT(vres0_2, va2, vb0, vl); - vres1_2 = VFMACCVV_FLOAT(vres1_2, va2, vb1, vl); - vres2_2 = VFMACCVV_FLOAT(vres2_2, va2, vb2, vl); - vres3_2 = VFMACCVV_FLOAT(vres3_2, va2, vb3, vl); - - vres0_3 = VFMACCVV_FLOAT(vres0_3, va3, vb0, vl); - vres1_3 = VFMACCVV_FLOAT(vres1_3, va3, vb1, vl); - vres2_3 = VFMACCVV_FLOAT(vres2_3, va3, vb2, vl); - vres3_3 = VFMACCVV_FLOAT(vres3_3, va3, vb3, vl); - - ptrba += vl * 4; - ptrbb += vl * 4; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); - vsum2 = 
VFREDSUMVS_FLOAT(vsum2, vres0_2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres0_3, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C0[2] = alpha * VFMVFS_FLOAT_M1(vsum2); - C0[3] = alpha * VFMVFS_FLOAT_M1(vsum3); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres1_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_3, v_z0, vlmax); - C1[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C1[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C1[2] = alpha * VFMVFS_FLOAT_M1(vsum2); - C1[3] = alpha * VFMVFS_FLOAT_M1(vsum3); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres2_1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2_2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres2_3, v_z0, vlmax); - C2[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C2[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C2[2] = alpha * VFMVFS_FLOAT_M1(vsum2); - C2[3] = alpha * VFMVFS_FLOAT_M1(vsum3); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres3_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres3_1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres3_2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3_3, v_z0, vlmax); - C3[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C3[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C3[2] = alpha * VFMVFS_FLOAT_M1(vsum2); - C3[3] = alpha * VFMVFS_FLOAT_M1(vsum3); - - if (!backwards) { - temp = bk-off; - temp = left ? temp - 4 : // number of values in A - temp - 4; // number of values in B - - ptrba += temp*4; // number of values in A - ptrbb += temp*4; // number of values in B - } -#ifdef LEFT - off += 4; // number of values in A -#endif - - C0 = C0+4; - C1 = C1+4; - C2 = C2+4; - C3 = C3+4; - - } - - if ( bm & 2 ) // do any 2x4 loop - { - -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*2; - ptrbb = bb + off*4; -#endif - - vres0_0 = VFMVVF_FLOAT(0, vlmax); - vres0_1 = VFMVVF_FLOAT(0, vlmax); - - vres1_0 = VFMVVF_FLOAT(0, vlmax); - vres1_1 = VFMVVF_FLOAT(0, vlmax); - - vres2_0 = VFMVVF_FLOAT(0, vlmax); - vres2_1 = VFMVVF_FLOAT(0, vlmax); - - vres3_0 = VFMVVF_FLOAT(0, vlmax); - vres3_1 = VFMVVF_FLOAT(0, vlmax); - - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+2; // number of values in A -#else - temp = off+4; // number of values in B -#endif - for (k = temp; k > 0; k -= vl) - { - vl = VSETVL(k); - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); - VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); - - vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); - vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); - vres2_0 = VFMACCVV_FLOAT(vres2_0, va0, vb2, vl); - vres3_0 = VFMACCVV_FLOAT(vres3_0, va0, vb3, vl); - - vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); - vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl); - vres2_1 = VFMACCVV_FLOAT(vres2_1, va1, vb2, vl); - vres3_1 = VFMACCVV_FLOAT(vres3_1, va1, vb3, vl); - - ptrba += vl * 2; - ptrbb += vl * 4; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_0, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_1, v_z0, vlmax); - - C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C1[0] = alpha * VFMVFS_FLOAT_M1(vsum2); - C1[1] = alpha * VFMVFS_FLOAT_M1(vsum3); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, 
vres2_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres2_1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres3_0, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3_1, v_z0, vlmax); - - C2[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C2[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C3[0] = alpha * VFMVFS_FLOAT_M1(vsum2); - C3[1] = alpha * VFMVFS_FLOAT_M1(vsum3); - - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 2; // number of values in A -#else - temp -= 4; // number of values in B -#endif - ptrba += temp*2; - ptrbb += temp*4; -#endif - -#ifdef LEFT - off += 2; // number of values in A -#endif - - C0 = C0+2; - C1 = C1+2; - C2 = C2+2; - C3 = C3+2; - - } - - if ( bm & 1 ) // do any 1x4 loop - { - -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*1; - ptrbb = bb + off*4; -#endif - - vres0_0 = VFMVVF_FLOAT(0, vlmax); - vres1_0 = VFMVVF_FLOAT(0, vlmax); - vres2_0 = VFMVVF_FLOAT(0, vlmax); - vres3_0 = VFMVVF_FLOAT(0, vlmax); - - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+1; // number of values in A -#else - temp = off+4; // number of values in B -#endif - - for (k = temp; k > 0; k -= vl) - { - vl = VSETVL(k); - va0 = VLEV_FLOAT(ptrba, vl); - VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); - - vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); - vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); - vres2_0 = VFMACCVV_FLOAT(vres2_0, va0, vb2, vl); - vres3_0 = VFMACCVV_FLOAT(vres3_0, va0, vb3, vl); - - ptrba += vl; - ptrbb += vl * 4; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_0, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2_0, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3_0, v_z0, vlmax); - - C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C1[0] = alpha * VFMVFS_FLOAT_M1(vsum1); - C2[0] = alpha * VFMVFS_FLOAT_M1(vsum2); - C3[0] = alpha * VFMVFS_FLOAT_M1(vsum3); - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 1; // number of values in A -#else - temp -= 4; // number of values in B -#endif - ptrba += temp*1; - ptrbb += temp*4; -#endif - -#ifdef LEFT - off += 1; // number of values in A -#endif - - C0 = C0+1; - C1 = C1+1; - C2 = C2+1; - C3 = C3+1; - - } - - -#if defined(TRMMKERNEL) && !defined(LEFT) - off += 4; -#endif - - k = (bk<<2); - bb = bb+k; - i = (ldc<<2); - C = C+i; - } - - for (j=0; j<(bn&2); j+=2) // do the Mx2 loops - { - C0 = C; - C1 = C0+ldc; - -#if defined(TRMMKERNEL) && defined(LEFT) - off = offset; -#endif - - ptrba = ba; - - for (i=0; i<bm/4; i+=1) - { -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*4; - ptrbb = bb + off*2; -#endif - - vres0_0 = VFMVVF_FLOAT(0, vlmax); - vres0_1 = VFMVVF_FLOAT(0, vlmax); - vres0_2 = VFMVVF_FLOAT(0, vlmax); - vres0_3 = VFMVVF_FLOAT(0, vlmax); - - vres1_0 = VFMVVF_FLOAT(0, vlmax); - vres1_1 = VFMVVF_FLOAT(0, vlmax); - vres1_2 = VFMVVF_FLOAT(0, vlmax); - vres1_3 = VFMVVF_FLOAT(0, vlmax); - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+4; // number of values in A -#else - temp = off+2; // number of values in B -#endif - - for (k = temp; k > 0; k -= vl) - { - vl = VSETVL(k); - VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); - VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); - vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); - - vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); - vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl); - - vres0_2 = VFMACCVV_FLOAT(vres0_2, va2, vb0, vl); - vres1_2 = VFMACCVV_FLOAT(vres1_2, va2, vb1, vl); - - vres0_3 = VFMACCVV_FLOAT(vres0_3, va3, vb0, vl); - vres1_3 = VFMACCVV_FLOAT(vres1_3, va3, vb1, vl); - - ptrba += vl * 4; - ptrbb += vl * 2; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres0_2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, 
vres0_3, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C0[2] = alpha * VFMVFS_FLOAT_M1(vsum2); - C0[3] = alpha * VFMVFS_FLOAT_M1(vsum3); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres1_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_3, v_z0, vlmax); - C1[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C1[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C1[2] = alpha * VFMVFS_FLOAT_M1(vsum2); - C1[3] = alpha * VFMVFS_FLOAT_M1(vsum3); - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 4; // number of values in A -#else - temp -= 2; // number of values in B -#endif - ptrba += temp*4; - ptrbb += temp*2; -#endif - -#ifdef LEFT - off += 4; // number of values in A -#endif - - C0 = C0+4; - C1 = C1+4; - - } - - if ( bm & 2 ) // do any 2x2 loop - { - -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*2; - ptrbb = bb + off*2; -#endif - - vres0_0 = VFMVVF_FLOAT(0, vlmax); - vres0_1 = VFMVVF_FLOAT(0, vlmax); - - vres1_0 = VFMVVF_FLOAT(0, vlmax); - vres1_1 = VFMVVF_FLOAT(0, vlmax); - - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+2; // number of values in A -#else - temp = off+2; // number of values in B -#endif - for (k = temp; k > 0; k -= vl) - { - vl = VSETVL(k); - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); - VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); - vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); - - vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); - vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl); - - ptrba += vl * 2; - ptrbb += vl * 2; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_0, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_1, v_z0, vlmax); - - C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C1[0] = alpha * VFMVFS_FLOAT_M1(vsum2); - C1[1] = alpha * VFMVFS_FLOAT_M1(vsum3); - - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 2; // number of values in A -#else - temp -= 2; // number of values in B -#endif - ptrba += temp*2; - ptrbb += temp*2; -#endif - -#ifdef LEFT - off += 2; // number of values in A -#endif - - C0 = C0+2; - C1 = C1+2; - - } - - if ( bm & 1 ) // do any 1x2 loop - { - -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*1; - ptrbb = bb + off*2; -#endif - - - vres0_0 = VFMVVF_FLOAT(0, vlmax); - vres1_0 = VFMVVF_FLOAT(0, vlmax); - - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+1; // number of values in A -#else - temp = off+2; // number of values in B -#endif - - for (k = temp; k > 0; k -= vl) - { - vl = VSETVL(k); - va0 = VLEV_FLOAT(ptrba, vl); - VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); - vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); - - ptrba += vl; - ptrbb += vl * 2; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_0, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C1[0] = 
alpha * VFMVFS_FLOAT_M1(vsum1); - - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 1; // number of values in A -#else - temp -= 2; // number of values in B -#endif - ptrba += temp*1; - ptrbb += temp*2; -#endif - -#ifdef LEFT - off += 1; // number of values in A -#endif - - C0 = C0+1; - C1 = C1+1; - - } - - -#if defined(TRMMKERNEL) && !defined(LEFT) - off += 2; -#endif - - k = (bk<<1); - bb = bb+k; - i = (ldc<<1); - C = C+i; - } - - for (j=0; j<(bn&1); j+=1) // do the Mx1 loops - { - C0 = C; - -#if defined(TRMMKERNEL) && defined(LEFT) - off = offset; -#endif - - ptrba = ba; - - for (i=0; i<bm/4; i+=1) - { -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*4; - ptrbb = bb + off*1; -#endif - - vres0_0 = VFMVVF_FLOAT(0, vlmax); - vres0_1 = VFMVVF_FLOAT(0, vlmax); - vres0_2 = VFMVVF_FLOAT(0, vlmax); - vres0_3 = VFMVVF_FLOAT(0, vlmax); - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+4; // number of values in A -#else - temp = off+1; // number of values in B -#endif - - for (k = temp; k > 0; k -= vl) - { - vl = VSETVL(k); - VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); - vb0 = VLEV_FLOAT(ptrbb, vl); - - vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); - - vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); - - vres0_2 = VFMACCVV_FLOAT(vres0_2, va2, vb0, vl); - - vres0_3 = VFMACCVV_FLOAT(vres0_3, va3, vb0, vl); - - ptrba += vl * 4; - ptrbb += vl; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres0_2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres0_3, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C0[2] = alpha * VFMVFS_FLOAT_M1(vsum2); - C0[3] = alpha * VFMVFS_FLOAT_M1(vsum3); - - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 4; // number of values in A -#else - temp -= 1; // number of values in B -#endif - ptrba += temp*4; - ptrbb += temp*1; -#endif - -#ifdef LEFT - off += 4; // number of values in A -#endif - - C0 = C0+4; - - } - - if ( bm & 2 ) // do any 2x1 loop - { - -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*2; - ptrbb = bb + off*1; -#endif - - vres0_0 = VFMVVF_FLOAT(0, vlmax); - vres0_1 = VFMVVF_FLOAT(0, vlmax); - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+2; // number of values in A -#else - temp = off+1; // number of values in B -#endif - - for (k = temp; k > 0; k -= vl) - { - vl = VSETVL(k); - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); - vb0 = VLEV_FLOAT(ptrbb, vl); - - vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); - - vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); - - ptrba += vl * 2; - ptrbb += vl; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 2; // number of values in A -#else - temp -= 1; // number of values in B -#endif - ptrba += temp*2; - ptrbb += temp*1; -#endif - -#ifdef LEFT - off += 2; // number of values in A -#endif - - C0 = C0+2; - - } - - if ( bm & 1 ) // do any 1x1 loop - { - -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*1; - ptrbb = bb + off*1; -#endif - - vres0_0 = VFMVVF_FLOAT(0, vlmax); - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+1; // number of values in A -#else - temp = off+1; // number of values in B -#endif - - for (k = temp; k > 0; k -= vl) - { - vl 
= VSETVL(k); - va0 = VLEV_FLOAT(ptrba, vl); - vb0 = VLEV_FLOAT(ptrbb, vl); - - vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); - - ptrba += vl; - ptrbb += vl; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 1; // number of values in A -#else - temp -= 1; // number of values in B -#endif - ptrba += temp*1; - ptrbb += temp*1; -#endif - -#ifdef LEFT - off += 1; // number of values in A -#endif - - C0 = C0+1; - - } - -#if defined(TRMMKERNEL) && !defined(LEFT) - off += 1; -#endif - - k = (bk<<0); - bb = bb+k; - C = C+ldc; - } - return 0; -} From 9702d57b11351a5360a2f0326c69c3f550c784d2 Mon Sep 17 00:00:00 2001 From: HellerZheng Date: Wed, 16 Nov 2022 11:11:04 +0800 Subject: [PATCH 3/5] Update Makefile.install --- Makefile.install | 2 -- 1 file changed, 2 deletions(-) diff --git a/Makefile.install b/Makefile.install index 168d08f72..87b5bc870 100644 --- a/Makefile.install +++ b/Makefile.install @@ -202,5 +202,3 @@ endif @echo " endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo "endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo Install OK! - - From 3918d8504e7720d94221025ae6078a2459ccb104 Mon Sep 17 00:00:00 2001 From: Heller Zheng Date: Mon, 21 Nov 2022 19:06:07 -0800 Subject: [PATCH 4/5] nrm2 simple optimization --- kernel/riscv64/nrm2_rvv.c | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/kernel/riscv64/nrm2_rvv.c b/kernel/riscv64/nrm2_rvv.c index 3f5d50397..979c31648 100644 --- a/kernel/riscv64/nrm2_rvv.c +++ b/kernel/riscv64/nrm2_rvv.c @@ -39,9 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMACCVV_FLOAT vfmacc_vv_f32m8 #define VFMVVF_FLOAT vfmv_v_f_f32m8 #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 #define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 -#define VFABSV_FLOAT vfabs_v_f32m8 #define ABS fabsf #else #define VSETVL(n) vsetvl_e64m8(n) @@ -54,9 +52,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
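(The VFREDMAXVS_FLOAT/VFABSV_FLOAT macros deleted in this hunk and the one above were used only by the running-max path that this patch removes.)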
#define VFMACCVV_FLOAT vfmacc_vv_f64m8 #define VFMVVF_FLOAT vfmv_v_f_f64m8 #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 #define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 -#define VFABSV_FLOAT vfabs_v_f64m8 #define ABS fabs #endif @@ -68,12 +64,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if(n == 1) return (ABS(x[0])); FLOAT_V_T vr, v0; - FLOAT_V_T_M1 v_max, v_res; - FLOAT scale = 0.0, ssq = 0.0; + FLOAT_V_T_M1 v_res; + FLOAT ssq = 0.0; size_t vlmax = VSETVL_MAX; v_res = VFMVVF_FLOAT_M1(0, vlmax); - v_max = VFMVVF_FLOAT_M1(0, vlmax); vr = VFMVVF_FLOAT(0, vlmax); @@ -83,9 +78,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vl = VSETVL(n); v0 = VLEV_FLOAT(x, vl); - v0 = VFABSV_FLOAT(v0, vl); - - v_max = VFREDMAXVS_FLOAT(v_max, v0, v_max, vl); vr = VFMACCVV_FLOAT(vr, v0, v0, vl); } @@ -98,20 +90,14 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vl = VSETVL(n); v0 = VLSEV_FLOAT(x, stride_x, vl); - v0 = VFABSV_FLOAT(v0, vl); - - v_max = VFREDMAXVS_FLOAT(v_max, v0, v_max, vl); vr = VFMACCVV_FLOAT(vr, v0, v0, vl); } - } v_res = VFREDSUM_FLOAT(v_res, vr, v_res, vlmax); ssq = VFMVFS_FLOAT_M1(v_res); - scale = VFMVFS_FLOAT_M1(v_max); - ssq = ssq / (scale*scale); - return(scale * sqrt(ssq)); + return sqrt(ssq); } From 387e8970cd8ce581a6c7bc48418860966140f621 Mon Sep 17 00:00:00 2001 From: Heller Zheng Date: Mon, 28 Nov 2022 21:42:29 -0800 Subject: [PATCH 5/5] Fix merge problem; Update compiling COMMON_OPT per review comments. --- Makefile.prebuild | 2 +- Makefile.riscv64 | 6 +++--- common_riscv64.h | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Makefile.prebuild b/Makefile.prebuild index e6a8eab59..c4f4a2602 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -56,7 +56,7 @@ TARGET_FLAGS = -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d endif ifeq ($(TARGET), x280) -TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh_xsfvqmaccqoq_xsfvfhbfmin -mabi=lp64d -mcpu=sifive-x280 +TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d endif ifeq ($(TARGET), RISCV64_GENERIC) diff --git a/Makefile.riscv64 b/Makefile.riscv64 index d6eaf552d..d091984a6 100644 --- a/Makefile.riscv64 +++ b/Makefile.riscv64 @@ -3,10 +3,10 @@ CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 FCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -static endif ifeq ($(CORE), x280) -CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_xsfvqmaccqoq_xsfvfhbfmin -mabi=lp64d -menable-experimental-extensions -mllvm --riscv-v-vector-bits-min=512 -mcpu=sifive-x280 -ffast-math -FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_xsfvqmaccqoq_xsfvfhbfmin -mabi=lp64d -menable-experimental-extensions -static +CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -mllvm --riscv-v-vector-bits-min=512 -ffast-math +FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static endif ifeq ($(CORE), RISCV64_GENERIC) CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d -static -endif \ No newline at end of file +endif diff --git a/common_riscv64.h b/common_riscv64.h index 221a79901..2092bd5ab 100644 --- a/common_riscv64.h +++ b/common_riscv64.h @@ -92,7 +92,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define SEEK_ADDRESS #if defined(C910V) -#include +#include #endif #if defined(x280)
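
A note on the nrm2 simplification in patch 4: the removed code divided by scale*scale only after the unscaled sum of squares had already been accumulated, so it did not actually protect the accumulation itself from overflow; dropping it therefore loses little real safety. For contrast, here is a minimal scalar sketch (illustrative only, not code from this patch; the names nrm2_guarded/nrm2_plain are hypothetical, double precision and inc_x == 1 are assumed) of a genuinely guarded LAPACK-style rescaling loop next to the plain form the kernel now computes:

/* Illustrative reference only -- not part of the patch. */
#include <math.h>

static double nrm2_guarded(long n, const double *x)
{
    double scale = 0.0, ssq = 1.0;   /* invariant: result = scale * sqrt(ssq) */
    for (long i = 0; i < n; i++) {
        double ax = fabs(x[i]);
        if (ax == 0.0) continue;
        if (scale < ax) {
            double r = scale / ax;   /* rescale so ssq stays near 1 */
            ssq = 1.0 + ssq * r * r;
            scale = ax;
        } else {
            double r = ax / scale;
            ssq += r * r;
        }
    }
    return scale * sqrt(ssq);
}

static double nrm2_plain(long n, const double *x)
{
    double ssq = 0.0;                /* what the kernel accumulates with vfmacc */
    for (long i = 0; i < n; i++)
        ssq += x[i] * x[i];          /* may overflow/underflow for extreme |x| */
    return sqrt(ssq);
}

The plain form is a reasonable trade for a kernel already built with -ffast-math (see the x280 flags in Makefile.riscv64 above); callers that need the guarded behaviour for extreme inputs would have to use a scaled variant.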