Merge branch 'develop' of https://github.com/HellerZheng/OpenBLAS_riscv_x280 into HellerZheng-develop
commit e5313f53d5
@@ -55,6 +55,14 @@ ifeq ($(TARGET), C910V)
TARGET_FLAGS = -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d
endif

ifeq ($(TARGET), x280)
TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d
endif

ifeq ($(TARGET), RISCV64_GENERIC)
TARGET_FLAGS = -march=rv64imafdc -mabi=lp64d
endif

all: getarch_2nd
	./getarch_2nd 0 >> $(TARGET_MAKE)
	./getarch_2nd 1 >> $(TARGET_CONF)
@@ -2,3 +2,11 @@ ifeq ($(CORE), C910V)
CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920
FCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -static
endif
ifeq ($(CORE), x280)
CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -mllvm --riscv-v-vector-bits-min=512 -ffast-math
FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static
endif
ifeq ($(CORE), RISCV64_GENERIC)
CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d
FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d -static
endif
@@ -186,6 +186,11 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
   ```
   (also known to work on C906)

- **x280**: LLVM auto-vectorization using RISC-V Vector extension 1.0.
   ```sh
   make HOSTCC=gcc TARGET=x280 NUM_THREADS=8 CC=riscv64-unknown-linux-gnu-clang FC=riscv64-unknown-linux-gnu-gfortran
   ```

### Support for multiple targets in a single library

OpenBLAS can be built for multiple targets with runtime detection of the target CPU by specifying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line, or as `-DDYNAMIC_ARCH=TRUE` in cmake.
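For example, an illustrative invocation from the gmake command line, using only the `DYNAMIC_ARCH=1` flag named in the paragraph above:

```sh
make DYNAMIC_ARCH=1
```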
@@ -120,6 +120,7 @@ Z14
10.RISC-V 64:
RISCV64_GENERIC (e.g. PolarFire Soc/SiFive U54)
C910V
x280

11.LOONGARCH64:
LOONGSONGENERIC
benchmark/Makefile (6884 changed lines; file diff suppressed because it is too large)
@@ -95,4 +95,8 @@ static inline int blas_quickdivide(blasint x, blasint y){
#include <riscv_vector.h>
#endif

#if defined(x280)
#include <riscv_vector.h>
#endif

#endif
@@ -72,10 +72,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#define CPU_GENERIC 0
#define CPU_C910V   1
#define CPU_x280    2

static char *cpuname[] = {
    "RISCV64_GENERIC",
    "C910V",
    "x280"
};

int detect(void){
getarch.c (12 changed lines)
@@ -1677,6 +1677,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define LIBNAME   "c910v"
#define CORENAME  "C910V"
#endif
#endif
#ifdef FORCE_x280
#define FORCE
#define ARCHITECTURE    "RISCV64"
#define SUBARCHITECTURE "x280"
#define SUBDIRNAME      "riscv64"
#define ARCHCONFIG   "-Dx280 " \
       "-DL1_DATA_SIZE=64536 -DL1_DATA_LINESIZE=32 " \
       "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \
       "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 "
#define LIBNAME   "x280"
#define CORENAME  "x280"
#else
#endif
@@ -0,0 +1,235 @@
# **********************************************************************************
# Copyright (c) 2022, The OpenBLAS Project
# All rights reserved.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in
# the documentation and/or other materials provided with the
# distribution.
# 3. Neither the name of the OpenBLAS project nor the names of
# its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# **********************************************************************************

SAMAXKERNEL = amax_rvv.c
DAMAXKERNEL = amax_rvv.c
CAMAXKERNEL = zamax_rvv.c
ZAMAXKERNEL = zamax_rvv.c

SAMINKERNEL = amin_rvv.c
DAMINKERNEL = amin_rvv.c
CAMINKERNEL = zamin_rvv.c
ZAMINKERNEL = zamin_rvv.c

SMAXKERNEL = max_rvv.c
DMAXKERNEL = max_rvv.c

SMINKERNEL = min_rvv.c
DMINKERNEL = min_rvv.c

ISAMAXKERNEL = iamax_rvv.c
IDAMAXKERNEL = iamax_rvv.c
ICAMAXKERNEL = izamax_rvv.c
IZAMAXKERNEL = izamax_rvv.c

ISAMINKERNEL = iamin_rvv.c
IDAMINKERNEL = iamin_rvv.c
ICAMINKERNEL = izamin_rvv.c
IZAMINKERNEL = izamin_rvv.c

ISMAXKERNEL = imax_rvv.c
IDMAXKERNEL = imax_rvv.c

ISMINKERNEL = imin_rvv.c
IDMINKERNEL = imin_rvv.c

SASUMKERNEL = asum_rvv.c
DASUMKERNEL = asum_rvv.c
CASUMKERNEL = zasum_rvv.c
ZASUMKERNEL = zasum_rvv.c

SSUMKERNEL = sum_rvv.c
DSUMKERNEL = sum_rvv.c
CSUMKERNEL = zsum_rvv.c
ZSUMKERNEL = zsum_rvv.c

SAXPYKERNEL = axpy_rvv.c
DAXPYKERNEL = axpy_rvv.c
CAXPYKERNEL = zaxpy_rvv.c
ZAXPYKERNEL = zaxpy_rvv.c

SAXPBYKERNEL = axpby_rvv.c
DAXPBYKERNEL = axpby_rvv.c
CAXPBYKERNEL = zaxpby_rvv.c
ZAXPBYKERNEL = zaxpby_rvv.c

SCOPYKERNEL = copy_rvv.c
DCOPYKERNEL = copy_rvv.c
CCOPYKERNEL = zcopy_rvv.c
ZCOPYKERNEL = zcopy_rvv.c

SDOTKERNEL = dot_rvv.c
DDOTKERNEL = dot_rvv.c
CDOTKERNEL = zdot_rvv.c
ZDOTKERNEL = zdot_rvv.c
DSDOTKERNEL = dot_rvv.c

SNRM2KERNEL = nrm2_rvv.c
DNRM2KERNEL = nrm2_rvv.c
CNRM2KERNEL = znrm2_rvv.c
ZNRM2KERNEL = znrm2_rvv.c

SROTKERNEL = rot_rvv.c
DROTKERNEL = rot_rvv.c
CROTKERNEL = zrot_rvv.c
ZROTKERNEL = zrot_rvv.c

SSCALKERNEL = scal_rvv.c
DSCALKERNEL = scal_rvv.c
CSCALKERNEL = zscal_rvv.c
ZSCALKERNEL = zscal_rvv.c

SSWAPKERNEL = swap_rvv.c
DSWAPKERNEL = swap_rvv.c
CSWAPKERNEL = zswap_rvv.c
ZSWAPKERNEL = zswap_rvv.c

SGEMVNKERNEL = gemv_n_rvv.c
DGEMVNKERNEL = gemv_n_rvv.c
CGEMVNKERNEL = zgemv_n_rvv.c
ZGEMVNKERNEL = zgemv_n_rvv.c

SGEMVTKERNEL = gemv_t_rvv.c
DGEMVTKERNEL = gemv_t_rvv.c
CGEMVTKERNEL = zgemv_t_rvv.c
ZGEMVTKERNEL = zgemv_t_rvv.c

CTRMMKERNEL = ztrmmkernel_2x2_rvv.c
ZTRMMKERNEL = ztrmmkernel_2x2_rvv.c

# SGEMM_UNROLL_N set in params.h
ifeq ($(SGEMM_UNROLL_N), 8)
# UNROLL_M is VLMAX
SGEMMKERNEL = gemmkernel_rvv_v1x8.c
SGEMMINCOPY = gemm_ncopy_rvv_v1.c
SGEMMITCOPY = gemm_tcopy_rvv_v1.c
SGEMMONCOPY = gemm_ncopy_$(SGEMM_UNROLL_N)_rvv.c
SGEMMOTCOPY = gemm_tcopy_$(SGEMM_UNROLL_N)_rvv.c
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)

STRMMKERNEL = trmmkernel_rvv_v1x8.c

STRMMUNCOPY_M = trmm_uncopy_rvv_v1.c
STRMMLNCOPY_M = trmm_lncopy_rvv_v1.c
STRMMUTCOPY_M = trmm_utcopy_rvv_v1.c
STRMMLTCOPY_M = trmm_ltcopy_rvv_v1.c

SSYMMUCOPY_M = symm_ucopy_rvv_v1.c
SSYMMLCOPY_M = symm_lcopy_rvv_v1.c
endif

# SGEMM_UNROLL_N set in params.h
ifeq ($(DGEMM_UNROLL_N), 8)
# UNROLL_M is VLMAX
DGEMMKERNEL = gemmkernel_rvv_v1x8.c
DGEMMINCOPY = gemm_ncopy_rvv_v1.c
DGEMMITCOPY = gemm_tcopy_rvv_v1.c
DGEMMONCOPY = gemm_ncopy_$(DGEMM_UNROLL_N)_rvv.c
DGEMMOTCOPY = gemm_tcopy_$(DGEMM_UNROLL_N)_rvv.c
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)

DTRMMKERNEL = trmmkernel_rvv_v1x8.c
DTRMMUNCOPY_M = trmm_uncopy_rvv_v1.c
DTRMMLNCOPY_M = trmm_lncopy_rvv_v1.c
DTRMMUTCOPY_M = trmm_utcopy_rvv_v1.c
DTRMMLTCOPY_M = trmm_ltcopy_rvv_v1.c

DSYMMUCOPY_M = symm_ucopy_rvv_v1.c
DSYMMLCOPY_M = symm_lcopy_rvv_v1.c
endif

CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o

ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o

STRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c
STRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c
STRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c
STRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c

DTRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c
DTRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c
DTRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c
DTRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c

CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

TRSMCOPYLN_M = trsm_lncopy_rvv_v1.c
TRSMCOPYLT_M = trsm_ltcopy_rvv_v1.c
TRSMCOPYUN_M = trsm_uncopy_rvv_v1.c
TRSMCOPYUT_M = trsm_utcopy_rvv_v1.c

SSYMV_U_KERNEL = symv_U_rvv.c
SSYMV_L_KERNEL = symv_L_rvv.c
DSYMV_U_KERNEL = symv_U_rvv.c
DSYMV_L_KERNEL = symv_L_rvv.c
CSYMV_U_KERNEL = ../generic/zsymv_k.c
CSYMV_L_KERNEL = ../generic/zsymv_k.c
ZSYMV_U_KERNEL = ../generic/zsymv_k.c
ZSYMV_L_KERNEL = ../generic/zsymv_k.c

LSAME_KERNEL = ../generic/lsame.c

SCABS_KERNEL = ../generic/cabs.c
DCABS_KERNEL = ../generic/cabs.c
QCABS_KERNEL = ../generic/cabs.c

ifndef SGEMM_BETA
SGEMM_BETA = gemm_beta_rvv.c
endif
ifndef DGEMM_BETA
DGEMM_BETA = gemm_beta_rvv.c
endif
ifndef CGEMM_BETA
CGEMM_BETA = zgemm_beta_rvv.c
endif
ifndef ZGEMM_BETA
ZGEMM_BETA = zgemm_beta_rvv.c
endif
@@ -0,0 +1,102 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"
#include <float.h>

#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m8()
#define VSETVL_MAX_M1 vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFMAXVV_FLOAT vfmax_vv_f32m8
#define VFABSV_FLOAT vfabs_v_f32m8
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m8()
#define VSETVL_MAX_M1 vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFMAXVV_FLOAT vfmax_vv_f64m8
#define VFABSV_FLOAT vfabs_v_f64m8
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
#endif

FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    FLOAT maxf = 0.0;

    if (n <= 0 || inc_x <= 0) return(maxf);

    FLOAT_V_T vx, vmax;
    FLOAT_V_T_M1 v_res;

    v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1);
    size_t vlmax = VSETVL_MAX;
    vmax = VFMVVF_FLOAT(0.0, vlmax);

    if(inc_x == 1) {

        for (size_t vl; n > 0; n -= vl, x += vl) {
            vl = VSETVL(n);

            vx = VLEV_FLOAT(x, vl);
            vx = VFABSV_FLOAT(vx, vl);
            vmax = VFMAXVV_FLOAT(vmax, vx, vl);
        }

    } else {

        BLASLONG stride_x = inc_x * sizeof(FLOAT);

        for (size_t vl; n > 0; n -= vl, x += vl*inc_x) {
            vl = VSETVL(n);

            vx = VLSEV_FLOAT(x, stride_x, vl);
            vx = VFABSV_FLOAT(vx, vl);
            vmax = VFMAXVV_FLOAT(vmax, vx, vl);
        }

    }

    v_res = VFREDMAXVS_FLOAT(v_res, vmax, v_res, vlmax);
    maxf = VFMVFS_FLOAT_M1(v_res);

    return(maxf);
}
@@ -0,0 +1,102 @@
/* Copyright (c) 2022, The OpenBLAS Project. All rights reserved.
   (Standard OpenBLAS BSD-3-Clause license header, identical to the one above.) */
#include "common.h"
#include <float.h>

#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m8()
#define VSETVL_MAX_M1 vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFMINVV_FLOAT vfmin_vv_f32m8
#define VFABSV_FLOAT vfabs_v_f32m8
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m8()
#define VSETVL_MAX_M1 vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFMINVV_FLOAT vfmin_vv_f64m8
#define VFABSV_FLOAT vfabs_v_f64m8
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
#endif

FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    FLOAT minf = 0.0;

    if (n <= 0 || inc_x <= 0) return(minf);

    FLOAT_V_T vx, vmin;
    FLOAT_V_T_M1 v_res;

    v_res = VFMVVF_FLOAT_M1(FLT_MAX, VSETVL_MAX_M1);
    size_t vlmax = VSETVL_MAX;
    vmin = VFMVVF_FLOAT(FLT_MAX, vlmax);

    if(inc_x == 1) {

        for (size_t vl; n > 0; n -= vl, x += vl) {
            vl = VSETVL(n);

            vx = VLEV_FLOAT(x, vl);
            vx = VFABSV_FLOAT(vx, vl);
            vmin = VFMINVV_FLOAT(vmin, vx, vl);
        }

    } else {

        BLASLONG stride_x = inc_x * sizeof(FLOAT);

        for (size_t vl; n > 0; n -= vl, x += vl*inc_x) {
            vl = VSETVL(n);

            vx = VLSEV_FLOAT(x, stride_x, vl);
            vx = VFABSV_FLOAT(vx, vl);
            vmin = VFMINVV_FLOAT(vmin, vx, vl);
        }

    }

    v_res = VFREDMINVS_FLOAT(v_res, vmin, v_res, vlmax);
    minf = VFMVFS_FLOAT_M1(v_res);

    return(minf);
}
@@ -0,0 +1,99 @@
/* Copyright (c) 2022, The OpenBLAS Project. All rights reserved.
   (Standard OpenBLAS BSD-3-Clause license header, identical to the one above.) */
#include "common.h"

#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m8()
#define VSETVL_MAX_M1 vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFADDVV_FLOAT vfadd_vv_f32m8
#define VFABSV_FLOAT vfabs_v_f32m8
#define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m8()
#define VSETVL_MAX_M1 vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFADDVV_FLOAT vfadd_vv_f64m8
#define VFABSV_FLOAT vfabs_v_f64m8
#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
#endif

FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    FLOAT asumf = 0.0;
    if (n <= 0 || inc_x <= 0) return(asumf);

    FLOAT_V_T vx, vsum;
    FLOAT_V_T_M1 v_res;

    v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1);
    size_t vlmax = VSETVL_MAX;
    vsum = VFMVVF_FLOAT(0.0, vlmax);

    if(inc_x == 1) {

        for (size_t vl; n > 0; n -= vl, x += vl) {
            vl = VSETVL(n);

            vx = VLEV_FLOAT(x, vl);
            vx = VFABSV_FLOAT(vx, vl);
            vsum = VFADDVV_FLOAT(vsum, vx, vl);
        }

    } else {

        BLASLONG stride_x = inc_x * sizeof(FLOAT);

        for (size_t vl; n > 0; n -= vl, x += vl*inc_x) {
            vl = VSETVL(n);

            vx = VLSEV_FLOAT(x, stride_x, vl);
            vx = VFABSV_FLOAT(vx, vl);
            vsum = VFADDVV_FLOAT(vsum, vx, vl);
        }

    }

    v_res = VFREDSUMVS_FLOAT(v_res, vsum, v_res, vlmax);
    asumf = VFMVFS_FLOAT_M1(v_res);
    return(asumf);
}
@@ -0,0 +1,171 @@
/* Copyright (c) 2022, The OpenBLAS Project. All rights reserved.
   (Standard OpenBLAS BSD-3-Clause license header, identical to the one above.) */
#include "common.h"

#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define FLOAT_V_T vfloat32m8_t
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VSEV_FLOAT vse32_v_f32m8
#define VSSEV_FLOAT vsse32_v_f32m8
#define VFMACCVF_FLOAT vfmacc_vf_f32m8
#define VFMULVF_FLOAT vfmul_vf_f32m8
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define FLOAT_V_T vfloat64m8_t
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VSEV_FLOAT vse64_v_f64m8
#define VSSEV_FLOAT vsse64_v_f64m8
#define VFMACCVF_FLOAT vfmacc_vf_f64m8
#define VFMULVF_FLOAT vfmul_vf_f64m8
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#endif

int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y)
{
    FLOAT_V_T vx, vy;

    if ( n < 0 ) return(0);

    if ( beta == 0.0 ) {
        if ( alpha == 0.0 ) {
            if (1 == inc_y) {
                memset(&y[0], 0, n * sizeof(FLOAT));
            } else {
                BLASLONG stride_y = inc_y * sizeof(FLOAT);
                size_t vl = VSETVL(n);
                vy = VFMVVF_FLOAT(0.0, vl);
                /* advance by elements (vl*inc_y), not by the byte stride */
                for ( ; n > 0; n -= vl, y += vl*inc_y) {
                    vl = VSETVL(n);
                    VSSEV_FLOAT(y, stride_y, vy, vl);
                }
            }

        } else {
            if ((1 == inc_x) && (1 == inc_y)) {
                for (size_t vl; n > 0; n -= vl, x += vl, y += vl) {
                    vl = VSETVL(n);
                    vx = VLEV_FLOAT(x, vl);
                    vy = VFMULVF_FLOAT(vx, alpha, vl);
                    VSEV_FLOAT (y, vy, vl);
                }
            } else if (1 == inc_x) {
                BLASLONG stride_y = inc_y * sizeof(FLOAT);
                for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) {
                    vl = VSETVL(n);
                    vx = VLEV_FLOAT(x, vl);
                    vy = VFMULVF_FLOAT(vx, alpha, vl);
                    VSSEV_FLOAT (y, stride_y, vy, vl);
                }
            } else if (1 == inc_y) {
                BLASLONG stride_x = inc_x * sizeof(FLOAT);
                for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) {
                    vl = VSETVL(n);
                    vx = VLSEV_FLOAT(x, stride_x, vl);
                    vy = VFMULVF_FLOAT(vx, alpha, vl);
                    VSEV_FLOAT (y, vy, vl);
                }
            } else {
                BLASLONG stride_x = inc_x * sizeof(FLOAT);
                BLASLONG stride_y = inc_y * sizeof(FLOAT);
                for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) {
                    vl = VSETVL(n);
                    vx = VLSEV_FLOAT(x, stride_x, vl);
                    vy = VFMULVF_FLOAT(vx, alpha, vl);
                    VSSEV_FLOAT (y, stride_y, vy, vl);
                }
            }
        }

    } else {
        if ( alpha == 0.0 ) {
            if (1 == inc_y) {
                for (size_t vl; n > 0; n -= vl, y += vl) {
                    vl = VSETVL(n);
                    vy = VLEV_FLOAT(y, vl);
                    vy = VFMULVF_FLOAT(vy, beta, vl);
                    VSEV_FLOAT (y, vy, vl);
                }
            } else {
                BLASLONG stride_y = inc_y * sizeof(FLOAT);
                for (size_t vl; n > 0; n -= vl, y += vl*inc_y) {
                    vl = VSETVL(n);
                    vy = VLSEV_FLOAT(y, stride_y, vl);
                    vy = VFMULVF_FLOAT(vy, beta, vl);
                    VSSEV_FLOAT (y, stride_y, vy, vl);
                }
            }

        } else {
            if ((1 == inc_x) && (1 == inc_y)) {
                /* y = alpha*x + beta*y: the alpha*x term was missing here;
                   restored by analogy with the strided branches below */
                for (size_t vl; n > 0; n -= vl, x += vl, y += vl) {
                    vl = VSETVL(n);
                    vx = VLEV_FLOAT(x, vl);
                    vy = VLEV_FLOAT(y, vl);
                    vy = VFMULVF_FLOAT(vy, beta, vl);
                    vy = VFMACCVF_FLOAT(vy, alpha, vx, vl);
                    VSEV_FLOAT (y, vy, vl);
                }
            } else if (1 == inc_x) {
                BLASLONG stride_y = inc_y * sizeof(FLOAT);
                for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) {
                    vl = VSETVL(n);
                    vx = VLEV_FLOAT(x, vl);
                    vy = VLSEV_FLOAT(y, stride_y, vl);
                    vy = VFMULVF_FLOAT(vy, beta, vl);
                    vy = VFMACCVF_FLOAT(vy, alpha, vx, vl);
                    VSSEV_FLOAT (y, stride_y, vy, vl);
                }
            } else if (1 == inc_y) {
                BLASLONG stride_x = inc_x * sizeof(FLOAT);
                for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) {
                    vl = VSETVL(n);
                    vx = VLSEV_FLOAT(x, stride_x, vl);
                    vy = VLEV_FLOAT(y, vl);
                    vy = VFMULVF_FLOAT(vy, beta, vl);
                    vy = VFMACCVF_FLOAT(vy, alpha, vx, vl);
                    VSEV_FLOAT (y, vy, vl);
                }
            } else {
                BLASLONG stride_x = inc_x * sizeof(FLOAT);
                BLASLONG stride_y = inc_y * sizeof(FLOAT);
                for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) {
                    vl = VSETVL(n);
                    vx = VLSEV_FLOAT(x, stride_x, vl);
                    vy = VLSEV_FLOAT(y, stride_y, vl);
                    vy = VFMULVF_FLOAT(vy, beta, vl);
                    vy = VFMACCVF_FLOAT(vy, alpha, vx, vl);
                    VSSEV_FLOAT (y, stride_y, vy, vl);
                }
            }
        }
    }

    return(0);
}
@@ -0,0 +1,109 @@
/* Copyright (c) 2022, The OpenBLAS Project. All rights reserved.
   (Standard OpenBLAS BSD-3-Clause license header, identical to the one above.) */
#include "common.h"

#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define FLOAT_V_T vfloat32m8_t
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VSEV_FLOAT vse32_v_f32m8
#define VSSEV_FLOAT vsse32_v_f32m8
#define VFMACCVF_FLOAT vfmacc_vf_f32m8
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define FLOAT_V_T vfloat64m8_t
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VSEV_FLOAT vse64_v_f64m8
#define VSSEV_FLOAT vsse64_v_f64m8
#define VFMACCVF_FLOAT vfmacc_vf_f64m8
#endif

int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
    if ( n <= 0 ) return(0);
    if ( da == 0.0 ) return(0);

    FLOAT_V_T vx, vy;

    if(inc_x == 1 && inc_y == 1) {

        for (size_t vl; n > 0; n -= vl, x += vl, y += vl) {
            vl = VSETVL(n);

            vx = VLEV_FLOAT(x, vl);
            vy = VLEV_FLOAT(y, vl);
            vy = VFMACCVF_FLOAT(vy, da, vx, vl);
            VSEV_FLOAT (y, vy, vl);
        }

    } else if (1 == inc_y) {

        BLASLONG stride_x = inc_x * sizeof(FLOAT);

        for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) {
            vl = VSETVL(n);

            vx = VLSEV_FLOAT(x, stride_x, vl);
            vy = VLEV_FLOAT(y, vl);
            vy = VFMACCVF_FLOAT(vy, da, vx, vl);
            VSEV_FLOAT(y, vy, vl);
        }

    } else if (1 == inc_x) {

        BLASLONG stride_y = inc_y * sizeof(FLOAT);

        for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) {
            vl = VSETVL(n);

            vx = VLEV_FLOAT(x, vl);
            vy = VLSEV_FLOAT(y, stride_y, vl);
            vy = VFMACCVF_FLOAT(vy, da, vx, vl);
            VSSEV_FLOAT(y, stride_y, vy, vl);
        }

    } else {

        BLASLONG stride_x = inc_x * sizeof(FLOAT);
        BLASLONG stride_y = inc_y * sizeof(FLOAT);

        for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) {
            vl = VSETVL(n);

            vx = VLSEV_FLOAT(x, stride_x, vl);
            vy = VLSEV_FLOAT(y, stride_y, vl);
            vy = VFMACCVF_FLOAT(vy, da, vx, vl);
            VSSEV_FLOAT(y, stride_y, vy, vl);
        }

    }

    return(0);
}
@@ -0,0 +1,94 @@
/* Copyright (c) 2022, The OpenBLAS Project. All rights reserved.
   (Standard OpenBLAS BSD-3-Clause license header, identical to the one above.) */
#include "common.h"

#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define FLOAT_V_T vfloat32m8_t
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VSEV_FLOAT vse32_v_f32m8
#define VSSEV_FLOAT vsse32_v_f32m8
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define FLOAT_V_T vfloat64m8_t
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VSEV_FLOAT vse64_v_f64m8
#define VSSEV_FLOAT vsse64_v_f64m8
#endif

int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
    if(n < 0) return(0);

    FLOAT_V_T v0;

    if(inc_x == 1 && inc_y == 1) {

        for(size_t vl; n > 0; n -= vl, x += vl, y += vl) {
            vl = VSETVL(n);
            v0 = VLEV_FLOAT(x, vl);
            VSEV_FLOAT(y, v0, vl);
        }

    } else if (inc_y == 1) {

        BLASLONG stride_x = inc_x * sizeof(FLOAT);

        for(size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) {
            vl = VSETVL(n);
            v0 = VLSEV_FLOAT(x, stride_x, vl);
            VSEV_FLOAT(y, v0, vl);
        }

    } else if(inc_x == 1) {

        BLASLONG stride_y = inc_y * sizeof(FLOAT);

        for(size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) {
            vl = VSETVL(n);
            v0 = VLEV_FLOAT(x, vl);
            VSSEV_FLOAT(y, stride_y, v0, vl);
        }

    } else {

        BLASLONG stride_x = inc_x * sizeof(FLOAT);
        BLASLONG stride_y = inc_y * sizeof(FLOAT);

        for(size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) {
            vl = VSETVL(n);
            v0 = VLSEV_FLOAT(x, stride_x, vl);
            VSSEV_FLOAT(y, stride_y, v0, vl);
        }

    }

    return(0);
}
@@ -0,0 +1,126 @@
/* Copyright (c) 2022, The OpenBLAS Project. All rights reserved.
   (Standard OpenBLAS BSD-3-Clause license header, identical to the one above.) */
#include "common.h"

#if defined(DSDOT)
double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
#else
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
#endif
{
    double dot = 0.0;

    if ( n <= 0 ) return(dot);

    size_t vlmax = vsetvlmax_e64m8();
    vfloat64m8_t vr = vfmv_v_f_f64m8(0, vlmax);

    if(inc_x == 1 && inc_y == 1) {

        for (size_t vl; n > 0; n -= vl, x += vl, y += vl) {
            vl = vsetvl_e64m8(n);

#if !defined(DOUBLE)
            vfloat32m4_t vx = vle32_v_f32m4(x, vl);
            vfloat32m4_t vy = vle32_v_f32m4(y, vl);

            vr = vfwmacc_vv_f64m8(vr, vx, vy, vl);
#else
            vfloat64m8_t vx = vle64_v_f64m8(x, vl);
            vfloat64m8_t vy = vle64_v_f64m8(y, vl);

            vr = vfmacc_vv_f64m8(vr, vx, vy, vl);
#endif
        }

    } else if (1 == inc_x) {

        BLASLONG stride_y = inc_y * sizeof(FLOAT);

        for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) {
            vl = vsetvl_e64m8(n);

#if !defined(DOUBLE)
            vfloat32m4_t vx = vle32_v_f32m4(x, vl);
            vfloat32m4_t vy = vlse32_v_f32m4(y, stride_y, vl);

            vr = vfwmacc_vv_f64m8(vr, vx, vy, vl);
#else
            vfloat64m8_t vx = vle64_v_f64m8(x, vl);
            vfloat64m8_t vy = vlse64_v_f64m8(y, stride_y, vl);

            vr = vfmacc_vv_f64m8(vr, vx, vy, vl);
#endif
        }
    } else if (1 == inc_y) {

        BLASLONG stride_x = inc_x * sizeof(FLOAT);

        for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) {
            vl = vsetvl_e64m8(n);

#if !defined(DOUBLE)
            vfloat32m4_t vx = vlse32_v_f32m4(x, stride_x, vl);
            vfloat32m4_t vy = vle32_v_f32m4(y, vl);

            vr = vfwmacc_vv_f64m8(vr, vx, vy, vl);
#else
            vfloat64m8_t vx = vlse64_v_f64m8(x, stride_x, vl);
            vfloat64m8_t vy = vle64_v_f64m8(y, vl);

            vr = vfmacc_vv_f64m8(vr, vx, vy, vl);
#endif
        }
    } else {

        BLASLONG stride_x = inc_x * sizeof(FLOAT);
        BLASLONG stride_y = inc_y * sizeof(FLOAT);

        for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) {
            vl = vsetvl_e64m8(n);

#if !defined(DOUBLE)
            vfloat32m4_t vx = vlse32_v_f32m4(x, stride_x, vl);
            vfloat32m4_t vy = vlse32_v_f32m4(y, stride_y, vl);

            vr = vfwmacc_vv_f64m8(vr, vx, vy, vl);
#else
            vfloat64m8_t vx = vlse64_v_f64m8(x, stride_x, vl);
            vfloat64m8_t vy = vlse64_v_f64m8(y, stride_y, vl);

            vr = vfmacc_vv_f64m8(vr, vx, vy, vl);
#endif
        }
    }

    vfloat64m1_t vec_zero = vfmv_v_f_f64m1(0, vlmax);
    vfloat64m1_t vec_sum = vfredusum_vs_f64m8_f64m1(vec_zero, vr, vec_zero, vlmax);
    dot = vfmv_f_s_f64m1_f64(vec_sum);

    return(dot);
}
@@ -0,0 +1,89 @@
/* Copyright (c) 2022, The OpenBLAS Project. All rights reserved.
   (Standard OpenBLAS BSD-3-Clause license header, identical to the one above.) */
#include "common.h"

#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define FLOAT_V_T vfloat32m8_t
#define VLEV_FLOAT vle32_v_f32m8
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMULVF_FLOAT vfmul_vf_f32m8
#define VSEV_FLOAT vse32_v_f32m8
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define FLOAT_V_T vfloat64m8_t
#define VLEV_FLOAT vle64_v_f64m8
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMULVF_FLOAT vfmul_vf_f64m8
#define VSEV_FLOAT vse64_v_f64m8
#endif

// Optimizes the implementation in ../generic/gemm_beta.c

int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
          IFLOAT *dummy2, BLASLONG dummy3, IFLOAT *dummy4, BLASLONG dummy5,
          FLOAT *c, BLASLONG ldc)
{
    BLASLONG chunk;
    FLOAT *c_offset;
    size_t vl;
    FLOAT_V_T vx;

    if (beta == ZERO) {

        vl = VSETVL(m);
        vx = VFMVVF_FLOAT(0.0, vl);

        for( ; n > 0; n--, c += ldc) {
            c_offset = c;

            for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl) {
                vl = VSETVL(chunk);

                VSEV_FLOAT(c_offset, vx, vl);
            }
        }

    } else {

        for( ; n > 0; n--, c += ldc) {
            c_offset = c;

            for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl) {
                vl = VSETVL(chunk);

                vx = VLEV_FLOAT(c_offset, vl);
                vx = VFMULVF_FLOAT(vx, beta, vl);
                VSEV_FLOAT(c_offset, vx, vl);
            }
        }

    }

    return 0;
}
@@ -0,0 +1,164 @@
/* Copyright (c) 2022, The OpenBLAS Project. All rights reserved.
   (Standard OpenBLAS BSD-3-Clause license header, identical to the one above.) */
#include "common.h"

#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m1(n)
#define FLOAT_V_T vfloat32m1_t
#define VLEV_FLOAT vle32_v_f32m1
#define VSEV_FLOAT vse32_v_f32m1
#define VSSEG2_FLOAT vsseg2e32_v_f32m1
#define VSSEG4_FLOAT vsseg4e32_v_f32m1
#define VSSEG8_FLOAT vsseg8e32_v_f32m1
#else
#define VSETVL(n) vsetvl_e64m1(n)
#define FLOAT_V_T vfloat64m1_t
#define VLEV_FLOAT vle64_v_f64m1
#define VSEV_FLOAT vse64_v_f64m1
#define VSSEG2_FLOAT vsseg2e64_v_f64m1
#define VSSEG4_FLOAT vsseg4e64_v_f64m1
#define VSSEG8_FLOAT vsseg8e64_v_f64m1
#endif

// Optimizes the implementation in ../generic/gemm_ncopy_8.c

int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b)
{
    BLASLONG i, j;

    FLOAT *a_offset;
    FLOAT *a_offset1, *a_offset2, *a_offset3, *a_offset4;
    FLOAT *a_offset5, *a_offset6, *a_offset7, *a_offset8;
    FLOAT *b_offset;

    FLOAT_V_T v1, v2, v3, v4, v5, v6, v7, v8;
    size_t vl;

    //fprintf(stderr, "gemm_ncopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda);

    a_offset = a;
    b_offset = b;

    for(j = (n >> 3); j > 0; j--) {
        a_offset1 = a_offset;
        a_offset2 = a_offset1 + lda;
        a_offset3 = a_offset2 + lda;
        a_offset4 = a_offset3 + lda;
        a_offset5 = a_offset4 + lda;
        a_offset6 = a_offset5 + lda;
        a_offset7 = a_offset6 + lda;
        a_offset8 = a_offset7 + lda;
        a_offset += 8 * lda;

        for(i = m; i > 0; i -= vl) {
            vl = VSETVL(i);

            v1 = VLEV_FLOAT(a_offset1, vl);
            v2 = VLEV_FLOAT(a_offset2, vl);
            v3 = VLEV_FLOAT(a_offset3, vl);
            v4 = VLEV_FLOAT(a_offset4, vl);
            v5 = VLEV_FLOAT(a_offset5, vl);
            v6 = VLEV_FLOAT(a_offset6, vl);
            v7 = VLEV_FLOAT(a_offset7, vl);
            v8 = VLEV_FLOAT(a_offset8, vl);

            VSSEG8_FLOAT(b_offset, v1, v2, v3, v4, v5, v6, v7, v8, vl);

            a_offset1 += vl;
            a_offset2 += vl;
            a_offset3 += vl;
            a_offset4 += vl;
            a_offset5 += vl;
            a_offset6 += vl;
            a_offset7 += vl;
            a_offset8 += vl;
            b_offset += vl*8;
        }
    }

    if (n & 4) {
        a_offset1 = a_offset;
        a_offset2 = a_offset1 + lda;
        a_offset3 = a_offset2 + lda;
        a_offset4 = a_offset3 + lda;
        a_offset += 4 * lda;

        for(i = m; i > 0; i -= vl) {
            vl = VSETVL(i);

            v1 = VLEV_FLOAT(a_offset1, vl);
            v2 = VLEV_FLOAT(a_offset2, vl);
            v3 = VLEV_FLOAT(a_offset3, vl);
            v4 = VLEV_FLOAT(a_offset4, vl);

            VSSEG4_FLOAT(b_offset, v1, v2, v3, v4, vl);

            a_offset1 += vl;
            a_offset2 += vl;
            a_offset3 += vl;
            a_offset4 += vl;
            b_offset += vl*4;
        }
    }

    if (n & 2) {
        a_offset1 = a_offset;
        a_offset2 = a_offset1 + lda;
        a_offset += 2 * lda;

        for(i = m; i > 0; i -= vl) {
            vl = VSETVL(i);

            v1 = VLEV_FLOAT(a_offset1, vl);
            v2 = VLEV_FLOAT(a_offset2, vl);

            VSSEG2_FLOAT(b_offset, v1, v2, vl);

            a_offset1 += vl;
            a_offset2 += vl;
            b_offset += vl*2;
        }
    }

    if (n & 1) {
        a_offset1 = a_offset;

        for(i = m; i > 0; i -= vl) {
            vl = VSETVL(i);

            v1 = VLEV_FLOAT(a_offset1, vl);

            VSEV_FLOAT(b_offset, v1, vl);

            a_offset1 += vl;
            b_offset += vl;
        }
    }

    return 0;
}
@@ -0,0 +1,76 @@
/* Copyright (c) 2022, The OpenBLAS Project. All rights reserved.
   (Standard OpenBLAS BSD-3-Clause license header, identical to the one above.) */
#include "common.h"

#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m2(n)
#define FLOAT_V_T vfloat32m2_t
#define VLEV_FLOAT vle32_v_f32m2
#define VLSEV_FLOAT vlse32_v_f32m2
#define VSEV_FLOAT vse32_v_f32m2
#else
#define VSETVL(n) vsetvl_e64m2(n)
#define FLOAT_V_T vfloat64m2_t
#define VLEV_FLOAT vle64_v_f64m2
#define VLSEV_FLOAT vlse64_v_f64m2
#define VSEV_FLOAT vse64_v_f64m2
#endif

int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b)
{
    BLASLONG i, j;

    FLOAT *a_offset;
    FLOAT *a_offset1;
    FLOAT *b_offset;

    FLOAT_V_T v0;
    size_t vl;

    //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda);

    a_offset = a;
    b_offset = b;

    for(j = n; j > 0; j -= vl) {
        vl = VSETVL(j);

        a_offset1 = a_offset;
        a_offset += vl * lda;

        for(i = m; i > 0; i--) {
            v0 = VLSEV_FLOAT(a_offset1, lda * sizeof(FLOAT), vl);
            VSEV_FLOAT(b_offset, v0, vl);

            a_offset1++;
            b_offset += vl;
        }
    }

    return 0;
}
@@ -0,0 +1,264 @@
/* Copyright (c) 2022, The OpenBLAS Project. All rights reserved.
   (Standard OpenBLAS BSD-3-Clause license header, identical to the one above.) */
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m1(n)
|
||||
#define FLOAT_V_T vfloat32m1_t
|
||||
#define VLEV_FLOAT vle32_v_f32m1
|
||||
#define VLSEV_FLOAT vlse32_v_f32m1
|
||||
#define VSEV_FLOAT vse32_v_f32m1
|
||||
#define VLSSEG2_FLOAT vlsseg2e32_v_f32m1
|
||||
#define VSSEG2_FLOAT vsseg2e32_v_f32m1
|
||||
#define VLSSEG4_FLOAT vlsseg4e32_v_f32m1
|
||||
#define VSSEG4_FLOAT vsseg4e32_v_f32m1
|
||||
#define VLSSEG8_FLOAT vlsseg8e32_v_f32m1
|
||||
#define VSSEG8_FLOAT vsseg8e32_v_f32m1
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m1(n)
|
||||
#define FLOAT_V_T vfloat64m1_t
|
||||
#define VLEV_FLOAT vle64_v_f64m1
|
||||
#define VLSEV_FLOAT vlse64_v_f64m1
|
||||
#define VSEV_FLOAT vse64_v_f64m1
|
||||
#define VLSSEG2_FLOAT vlsseg2e64_v_f64m1
|
||||
#define VSSEG2_FLOAT vsseg2e64_v_f64m1
|
||||
#define VLSSEG4_FLOAT vlsseg4e64_v_f64m1
|
||||
#define VSSEG4_FLOAT vsseg4e64_v_f64m1
|
||||
#define VLSSEG8_FLOAT vlsseg8e64_v_f64m1
|
||||
#define VSSEG8_FLOAT vsseg8e64_v_f64m1
|
||||
#endif
|
||||
|
||||
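/*
 * gemm_tcopy_8 (see the debug print inside): copies an m x n block of
 * the source matrix a (leading dimension lda) into the packed buffer b.
 * Rows are handled 8/4/2/1 at a time and, within each row group, columns
 * are copied in chunks of 8/4/2/1 using the strided segment loads defined
 * above, so every chunk ends up contiguous in b.
 */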
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
|
||||
{
|
||||
BLASLONG i, j;
|
||||
|
||||
IFLOAT *aoffset;
|
||||
IFLOAT *aoffset1;
|
||||
|
||||
IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4;
|
||||
|
||||
FLOAT_V_T v0, v1, v2, v3, v4, v5, v6, v7;
|
||||
|
||||
// fprintf(stderr, "gemm_tcopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda);
|
||||
|
||||
aoffset = a;
|
||||
boffset = b;
|
||||
boffset2 = b + m * (n & ~7);
|
||||
boffset3 = b + m * (n & ~3);
|
||||
boffset4 = b + m * (n & ~1);
|
||||
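/*
 * b is carved into four regions: full 8-column chunks are written first,
 * followed by the 4-, 2- and 1-column remainders. boffset2, boffset3 and
 * boffset4 mark where each remainder region begins (after m*(n&~7),
 * m*(n&~3) and m*(n&~1) elements, respectively).
 */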
|
||||
for(j = (m >> 3); j > 0; j--) {
|
||||
|
||||
aoffset1 = aoffset;
|
||||
aoffset += 8 * lda;
|
||||
|
||||
boffset1 = boffset;
|
||||
boffset += 64;
|
||||
|
||||
for(i = (n >> 3); i > 0; i--) {
|
||||
size_t vl = 8;
|
||||
|
||||
VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT), vl);
|
||||
VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl);
|
||||
|
||||
aoffset1 += 8;
|
||||
boffset1 += m * 8;
|
||||
}
|
||||
|
||||
if (n & 4) {
|
||||
size_t vl = 8;
|
||||
|
||||
VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT), vl);
|
||||
VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl);
|
||||
|
||||
aoffset1 += 4;
|
||||
boffset2 += 32;
|
||||
}
|
||||
|
||||
if (n & 2) {
|
||||
size_t vl = 8;
|
||||
|
||||
VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT), vl);
|
||||
VSSEG2_FLOAT(boffset3, v0, v1, vl);
|
||||
|
||||
aoffset1 += 2;
|
||||
boffset3 += 16;
|
||||
}
|
||||
|
||||
if (n & 1) {
|
||||
size_t vl = 8;
|
||||
|
||||
v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
|
||||
VSEV_FLOAT(boffset4, v0, vl);
|
||||
|
||||
aoffset1 += 1;
|
||||
boffset4 += 8;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if (m & 4) {
|
||||
|
||||
aoffset1 = aoffset;
|
||||
aoffset += 4 * lda;
|
||||
|
||||
boffset1 = boffset;
|
||||
boffset += 32;
|
||||
|
||||
for(i = (n >> 3); i > 0; i--) {
|
||||
size_t vl = 4;
|
||||
|
||||
VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT), vl);
|
||||
VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl);
|
||||
|
||||
aoffset1 += 8;
|
||||
boffset1 += m * 8;
|
||||
}
|
||||
|
||||
if (n & 4) {
|
||||
size_t vl = 4;
|
||||
|
||||
VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT), vl);
|
||||
VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl);
|
||||
|
||||
aoffset1 += 4;
|
||||
boffset2 += 16;
|
||||
}
|
||||
|
||||
if (n & 2) {
|
||||
size_t vl = 4;
|
||||
|
||||
VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT), vl);
|
||||
VSSEG2_FLOAT(boffset3, v0, v1, vl);
|
||||
|
||||
aoffset1 += 2;
|
||||
boffset3 += 8;
|
||||
}
|
||||
|
||||
if (n & 1) {
|
||||
size_t vl = 4;
|
||||
|
||||
v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
|
||||
VSEV_FLOAT(boffset4, v0, vl);
|
||||
|
||||
aoffset1 += 1;
|
||||
boffset4 += 4;
|
||||
}
|
||||
}
|
||||
|
||||
if (m & 2) {
|
||||
aoffset1 = aoffset;
|
||||
aoffset += 2 * lda;
|
||||
|
||||
boffset1 = boffset;
|
||||
boffset += 16;
|
||||
|
||||
for(i = (n >> 3); i > 0; i--) {
|
||||
size_t vl = 2;
|
||||
|
||||
VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT), vl);
|
||||
VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl);
|
||||
|
||||
aoffset1 += 8;
|
||||
boffset1 += m * 8;
|
||||
}
|
||||
|
||||
if (n & 4) {
|
||||
size_t vl = 2;
|
||||
|
||||
VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT), vl);
|
||||
VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl);
|
||||
|
||||
aoffset1 += 4;
|
||||
boffset2 += 8;
|
||||
}
|
||||
|
||||
if (n & 2) {
|
||||
size_t vl = 2;
|
||||
|
||||
VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT), vl);
|
||||
VSSEG2_FLOAT(boffset3, v0, v1, vl);
|
||||
|
||||
aoffset1 += 2;
|
||||
boffset3 += 4;
|
||||
}
|
||||
|
||||
if (n & 1) {
|
||||
size_t vl = 2;
|
||||
|
||||
v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
|
||||
VSEV_FLOAT(boffset4, v0, vl);
|
||||
|
||||
aoffset1 += 1;
|
||||
boffset4 += 2;
|
||||
}
|
||||
}
|
||||
|
||||
if (m & 1) {
|
||||
aoffset1 = aoffset;
|
||||
boffset1 = boffset;
|
||||
|
||||
for(i = (n >> 3); i > 0; i--) {
|
||||
size_t vl = 8;
|
||||
|
||||
v0 = VLEV_FLOAT(aoffset1, vl);
|
||||
VSEV_FLOAT(boffset1, v0, vl);
|
||||
|
||||
aoffset1 += 8;
|
||||
boffset1 += 8 * m;
|
||||
}
|
||||
|
||||
if (n & 4) {
|
||||
size_t vl = 4;
|
||||
|
||||
v0 = VLEV_FLOAT(aoffset1, vl);
|
||||
VSEV_FLOAT(boffset2, v0, vl);
|
||||
|
||||
aoffset1 += 4;
|
||||
//boffset2 += 4;
|
||||
}
|
||||
|
||||
if (n & 2) {
|
||||
size_t vl = 2;
|
||||
|
||||
v0 = VLEV_FLOAT(aoffset1, vl);
|
||||
VSEV_FLOAT(boffset3, v0, vl);
|
||||
|
||||
aoffset1 += 2;
|
||||
// boffset3 += 2;
|
||||
}
|
||||
|
||||
if (n & 1) {
|
||||
*(boffset4) = *(aoffset1);
|
||||
// aoffset1 ++;
|
||||
// boffset4 ++;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
|
@@ -0,0 +1,74 @@

/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m2(n)
#define FLOAT_V_T vfloat32m2_t
#define VLEV_FLOAT vle32_v_f32m2
#define VSEV_FLOAT vse32_v_f32m2
#else
#define VSETVL(n) vsetvl_e64m2(n)
#define FLOAT_V_T vfloat64m2_t
#define VLEV_FLOAT vle64_v_f64m2
#define VSEV_FLOAT vse64_v_f64m2
#endif

int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
{
    BLASLONG i, j;

    IFLOAT *aoffset;
    IFLOAT *aoffset1;
    IFLOAT *boffset;

    FLOAT_V_T v0;
    size_t vl;

    //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda);

    aoffset = a;
    boffset = b;

    for(j = n; j > 0; j -= vl) {
        vl = VSETVL(j);

        aoffset1 = aoffset;
        aoffset += vl;

        for(i = m; i > 0; i--) {
            v0 = VLEV_FLOAT(aoffset1, vl);
            VSEV_FLOAT(boffset, v0, vl);

            aoffset1 += lda;
            boffset += vl;
        }
    }

    return 0;
}
@@ -0,0 +1,601 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m2(n)
|
||||
#define FLOAT_V_T vfloat32m2_t
|
||||
#define VLEV_FLOAT vle32_v_f32m2
|
||||
#define VSEV_FLOAT vse32_v_f32m2
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f32m2
|
||||
#define VFMACCVF_FLOAT vfmacc_vf_f32m2
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m2(n)
|
||||
#define FLOAT_V_T vfloat64m2_t
|
||||
#define VLEV_FLOAT vle64_v_f64m2
|
||||
#define VSEV_FLOAT vse64_v_f64m2
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f64m2
|
||||
#define VFMACCVF_FLOAT vfmacc_vf_f64m2
|
||||
#endif
|
||||
|
||||
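/*
 * x280 GEMM kernel: accumulates alpha * (packed A panel) * (packed B panel)
 * into the bm x bn block of C. The M dimension is vectorized with vsetvl
 * (LMUL=2 types above), N is processed 8/4/2/1 columns at a time, and K is
 * unrolled by 8; the simple reference loops are kept under "#if 0".
 */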
int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, IFLOAT* ba, IFLOAT* bb, FLOAT* C, BLASLONG ldc
|
||||
#ifdef TRMMKERNEL
|
||||
,BLASLONG offset
|
||||
#endif
|
||||
)
|
||||
{
|
||||
BLASLONG i,j,k;
|
||||
FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7;
|
||||
IFLOAT *ptrba,*ptrbb;
|
||||
|
||||
//fprintf(stderr, "%s, bm=%ld bn=%ld bk=%ld alpha=%f ldc=%ld\n", __FUNCTION__, bm, bn, bk, alpha, ldc); // Debug
|
||||
|
||||
FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7;
|
||||
FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7;
|
||||
size_t vl;
|
||||
|
||||
// N:8
|
||||
for (j = bn/8; j > 0; j--) {
|
||||
C0 = C;
|
||||
C1 = C0 + ldc;
|
||||
C2 = C1 + ldc;
|
||||
C3 = C2 + ldc;
|
||||
C4 = C3 + ldc;
|
||||
C5 = C4 + ldc;
|
||||
C6 = C5 + ldc;
|
||||
C7 = C6 + ldc;
|
||||
ptrba = ba;
|
||||
|
||||
for (i = bm; i > 0; i -= vl) {
|
||||
vl = VSETVL(i);
|
||||
|
||||
ptrbb = bb;
|
||||
|
||||
vres0 = VFMVVF_FLOAT(0.0, vl);
|
||||
vres1 = VFMVVF_FLOAT(0.0, vl);
|
||||
vres2 = VFMVVF_FLOAT(0.0, vl);
|
||||
vres3 = VFMVVF_FLOAT(0.0, vl);
|
||||
vres4 = VFMVVF_FLOAT(0.0, vl);
|
||||
vres5 = VFMVVF_FLOAT(0.0, vl);
|
||||
vres6 = VFMVVF_FLOAT(0.0, vl);
|
||||
vres7 = VFMVVF_FLOAT(0.0, vl);
|
||||
#if 0
|
||||
for (k = bk; k > 0; k--) {
|
||||
va0 = VLEV_FLOAT(ptrba, vl);
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl);
|
||||
vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl);
|
||||
vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl);
|
||||
vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl);
|
||||
vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl);
|
||||
|
||||
ptrba += vl;
|
||||
ptrbb += 8;
|
||||
}
|
||||
#else
|
||||
// Unroll K
|
||||
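// Each pass covers 8 values of k; the next A vector is loaded before the
// current one is consumed, so the loads overlap the scalar-times-vector FMAs.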
for (k = bk/8; k > 0; k--) {
|
||||
va0 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
va1 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl);
|
||||
vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl);
|
||||
vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl);
|
||||
vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl);
|
||||
vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl);
|
||||
ptrbb += 8;
|
||||
|
||||
va2 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl);
|
||||
vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va1, vl);
|
||||
vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va1, vl);
|
||||
vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va1, vl);
|
||||
vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va1, vl);
|
||||
ptrbb += 8;
|
||||
|
||||
va3 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl);
|
||||
vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va2, vl);
|
||||
vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va2, vl);
|
||||
vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va2, vl);
|
||||
vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va2, vl);
|
||||
ptrbb += 8;
|
||||
|
||||
va4 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl);
|
||||
vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va3, vl);
|
||||
vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va3, vl);
|
||||
vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va3, vl);
|
||||
vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va3, vl);
|
||||
ptrbb += 8;
|
||||
|
||||
va5 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl);
|
||||
vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va4, vl);
|
||||
vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va4, vl);
|
||||
vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va4, vl);
|
||||
vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va4, vl);
|
||||
ptrbb += 8;
|
||||
|
||||
va6 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl);
|
||||
vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va5, vl);
|
||||
vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va5, vl);
|
||||
vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va5, vl);
|
||||
vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va5, vl);
|
||||
ptrbb += 8;
|
||||
|
||||
va7 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl);
|
||||
vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va6, vl);
|
||||
vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va6, vl);
|
||||
vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va6, vl);
|
||||
vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va6, vl);
|
||||
ptrbb += 8;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl);
|
||||
vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va7, vl);
|
||||
vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va7, vl);
|
||||
vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va7, vl);
|
||||
vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va7, vl);
|
||||
ptrbb += 8;
|
||||
}
|
||||
|
||||
// K remainder
|
||||
for (k = bk&7; k > 0; k--) {
|
||||
va0 = VLEV_FLOAT(ptrba, vl);
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl);
|
||||
vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl);
|
||||
vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl);
|
||||
vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl);
|
||||
vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl);
|
||||
|
||||
ptrbb += 8;
|
||||
ptrba += vl;
|
||||
}
|
||||
#endif
|
||||
va0 = VLEV_FLOAT(C0, vl);
|
||||
va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl);
|
||||
VSEV_FLOAT(C0, va0, vl);
|
||||
|
||||
va1 = VLEV_FLOAT(C1, vl);
|
||||
va1 = VFMACCVF_FLOAT(va1, alpha, vres1, vl);
|
||||
VSEV_FLOAT(C1, va1, vl);
|
||||
|
||||
va2 = VLEV_FLOAT(C2, vl);
|
||||
va2 = VFMACCVF_FLOAT(va2, alpha, vres2, vl);
|
||||
VSEV_FLOAT(C2, va2, vl);
|
||||
|
||||
va3 = VLEV_FLOAT(C3, vl);
|
||||
va3 = VFMACCVF_FLOAT(va3, alpha, vres3, vl);
|
||||
VSEV_FLOAT(C3, va3, vl);
|
||||
|
||||
va4 = VLEV_FLOAT(C4, vl);
|
||||
va4 = VFMACCVF_FLOAT(va4, alpha, vres4, vl);
|
||||
VSEV_FLOAT(C4, va4, vl);
|
||||
|
||||
va5 = VLEV_FLOAT(C5, vl);
|
||||
va5 = VFMACCVF_FLOAT(va5, alpha, vres5, vl);
|
||||
VSEV_FLOAT(C5, va5, vl);
|
||||
|
||||
va6 = VLEV_FLOAT(C6, vl);
|
||||
va6 = VFMACCVF_FLOAT(va6, alpha, vres6, vl);
|
||||
VSEV_FLOAT(C6, va6, vl);
|
||||
|
||||
va7 = VLEV_FLOAT(C7, vl);
|
||||
va7 = VFMACCVF_FLOAT(va7, alpha, vres7, vl);
|
||||
VSEV_FLOAT(C7, va7, vl);
|
||||
|
||||
C0 += vl;
|
||||
C1 += vl;
|
||||
C2 += vl;
|
||||
C3 += vl;
|
||||
C4 += vl;
|
||||
C5 += vl;
|
||||
C6 += vl;
|
||||
C7 += vl;
|
||||
}
|
||||
|
||||
bb += (bk<<3);
|
||||
C += (ldc<<3);
|
||||
}
|
||||
|
||||
// N:4
|
||||
if (bn & 4) {
|
||||
C0 = C;
|
||||
C1 = C0 + ldc;
|
||||
C2 = C1 + ldc;
|
||||
C3 = C2 + ldc;
|
||||
ptrba = ba;
|
||||
|
||||
for (i = bm; i > 0; i -= vl) {
|
||||
vl = VSETVL(i);
|
||||
|
||||
ptrbb = bb;
|
||||
|
||||
vres0 = VFMVVF_FLOAT(0.0, vl);
|
||||
vres1 = VFMVVF_FLOAT(0.0, vl);
|
||||
vres2 = VFMVVF_FLOAT(0.0, vl);
|
||||
vres3 = VFMVVF_FLOAT(0.0, vl);
|
||||
|
||||
#if 0
|
||||
for (k = bk; k > 0; k--) {
|
||||
va0 = VLEV_FLOAT(ptrba, vl);
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl);
|
||||
|
||||
ptrba += vl;
|
||||
ptrbb += 4;
|
||||
}
|
||||
#else
|
||||
// Unroll K
|
||||
for (k = bk/8; k > 0; k--) {
|
||||
va0 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
va1 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl);
|
||||
ptrbb += 4;
|
||||
va2 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl);
|
||||
ptrbb += 4;
|
||||
va3 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl);
|
||||
ptrbb += 4;
|
||||
va4 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl);
|
||||
ptrbb += 4;
|
||||
va5 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl);
|
||||
ptrbb += 4;
|
||||
va6 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl);
|
||||
ptrbb += 4;
|
||||
va7 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl);
|
||||
ptrbb += 4;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl);
|
||||
ptrbb += 4;
|
||||
}
|
||||
|
||||
// K remainder
|
||||
for (k = bk&7; k > 0; k--) {
|
||||
va0 = VLEV_FLOAT(ptrba, vl);
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl);
|
||||
|
||||
ptrbb += 4;
|
||||
ptrba += vl;
|
||||
}
|
||||
#endif
|
||||
va0 = VLEV_FLOAT(C0, vl);
|
||||
va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl);
|
||||
VSEV_FLOAT(C0, va0, vl);
|
||||
|
||||
va1 = VLEV_FLOAT(C1, vl);
|
||||
va1 = VFMACCVF_FLOAT(va1, alpha, vres1, vl);
|
||||
VSEV_FLOAT(C1, va1, vl);
|
||||
|
||||
va2 = VLEV_FLOAT(C2, vl);
|
||||
va2 = VFMACCVF_FLOAT(va2, alpha, vres2, vl);
|
||||
VSEV_FLOAT(C2, va2, vl);
|
||||
|
||||
va3 = VLEV_FLOAT(C3, vl);
|
||||
va3 = VFMACCVF_FLOAT(va3, alpha, vres3, vl);
|
||||
VSEV_FLOAT(C3, va3, vl);
|
||||
|
||||
C0 += vl;
|
||||
C1 += vl;
|
||||
C2 += vl;
|
||||
C3 += vl;
|
||||
}
|
||||
|
||||
bb += (bk<<2);
|
||||
C += (ldc<<2);
|
||||
}
|
||||
|
||||
// N:2
|
||||
if (bn & 2) {
|
||||
C0 = C;
|
||||
C1 = C0 + ldc;
|
||||
ptrba = ba;
|
||||
|
||||
for (i = bm; i > 0; i -= vl) {
|
||||
vl = VSETVL(i);
|
||||
|
||||
ptrbb = bb;
|
||||
|
||||
vres0 = VFMVVF_FLOAT(0.0, vl);
|
||||
vres1 = VFMVVF_FLOAT(0.0, vl);
|
||||
#if 0
|
||||
for (k = bk; k > 0; k--) {
|
||||
va0 = VLEV_FLOAT(ptrba, vl);
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
|
||||
|
||||
ptrba += vl;
|
||||
ptrbb += 2;
|
||||
}
|
||||
#else
|
||||
// Unroll K
|
||||
for (k = bk/8; k > 0; k--) {
|
||||
va0 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
va1 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
|
||||
ptrbb += 2;
|
||||
va2 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl);
|
||||
ptrbb += 2;
|
||||
va3 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl);
|
||||
ptrbb += 2;
|
||||
va4 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl);
|
||||
ptrbb += 2;
|
||||
va5 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl);
|
||||
ptrbb += 2;
|
||||
va6 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl);
|
||||
ptrbb += 2;
|
||||
va7 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl);
|
||||
ptrbb += 2;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl);
|
||||
ptrbb += 2;
|
||||
}
|
||||
|
||||
// K remainder
|
||||
for (k = bk&7; k > 0; k--) {
|
||||
va0 = VLEV_FLOAT(ptrba, vl);
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
|
||||
|
||||
ptrbb += 2;
|
||||
ptrba += vl;
|
||||
}
|
||||
#endif
|
||||
va0 = VLEV_FLOAT(C0, vl);
|
||||
va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl);
|
||||
VSEV_FLOAT(C0, va0, vl);
|
||||
|
||||
va1 = VLEV_FLOAT(C1, vl);
|
||||
va1 = VFMACCVF_FLOAT(va1, alpha, vres1, vl);
|
||||
VSEV_FLOAT(C1, va1, vl);
|
||||
|
||||
C0 += vl;
|
||||
C1 += vl;
|
||||
}
|
||||
|
||||
bb += (bk<<1);
|
||||
C += (ldc<<1);
|
||||
}
|
||||
|
||||
// N:1
|
||||
if (bn & 1) {
|
||||
C0 = C;
|
||||
ptrba = ba;
|
||||
|
||||
for (i = bm; i > 0; i -= vl) {
|
||||
vl = VSETVL(i);
|
||||
|
||||
ptrbb = bb;
|
||||
|
||||
vres0 = VFMVVF_FLOAT(0.0, vl);
|
||||
#if 0
|
||||
for (k = bk; k > 0; k--) {
|
||||
va0 = VLEV_FLOAT(ptrba, vl);
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
|
||||
|
||||
ptrba += vl;
|
||||
ptrbb += 1;
|
||||
}
|
||||
#else
|
||||
// Unroll K
|
||||
for (k = bk/8; k > 0; k--) {
|
||||
va0 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
va1 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
|
||||
ptrbb += 1;
|
||||
va2 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl);
|
||||
ptrbb += 1;
|
||||
va3 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl);
|
||||
ptrbb += 1;
|
||||
va4 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl);
|
||||
ptrbb += 1;
|
||||
va5 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl);
|
||||
ptrbb += 1;
|
||||
va6 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl);
|
||||
ptrbb += 1;
|
||||
va7 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl);
|
||||
ptrbb += 1;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl);
|
||||
ptrbb += 1;
|
||||
}
|
||||
|
||||
// K remainder
|
||||
for (k = bk&7; k > 0; k--) {
|
||||
va0 = VLEV_FLOAT(ptrba, vl);
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
|
||||
|
||||
ptrbb += 1;
|
||||
ptrba += vl;
|
||||
}
|
||||
#endif
|
||||
va0 = VLEV_FLOAT(C0, vl);
|
||||
va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl);
|
||||
VSEV_FLOAT(C0, va0, vl);
|
||||
|
||||
C0 += vl;
|
||||
}
|
||||
|
||||
bb += (bk);
|
||||
C += (ldc);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
|
@@ -0,0 +1,94 @@

/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define FLOAT_V_T vfloat32m8_t
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VSEV_FLOAT vse32_v_f32m8
#define VSSEV_FLOAT vsse32_v_f32m8
#define VFMACCVF_FLOAT vfmacc_vf_f32m8
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define FLOAT_V_T vfloat64m8_t
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VSEV_FLOAT vse64_v_f64m8
#define VSSEV_FLOAT vsse64_v_f64m8
#define VFMACCVF_FLOAT vfmacc_vf_f64m8
#endif

int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
    if(n < 0) return(0);

    FLOAT *a_ptr, *x_ptr;
    BLASLONG i;
    FLOAT_V_T va, vy;

    if(inc_y == 1) {

        for (size_t vl; m > 0; m -= vl, y += vl, a += vl) {
            vl = VSETVL(m);
            a_ptr = a;
            x_ptr = x;
            vy = VLEV_FLOAT(y, vl);
            for(i = 0; i < n; i++) {
                va = VLEV_FLOAT(a_ptr, vl);
                vy = VFMACCVF_FLOAT(vy, (alpha * (*x_ptr)), va, vl);

                a_ptr += lda;
                x_ptr += inc_x;
            }
            VSEV_FLOAT(y, vy, vl);
        }

    } else {

        BLASLONG stride_y = inc_y * sizeof(FLOAT);

        for (size_t vl; m > 0; m -= vl, y += vl*inc_y, a += vl) {
            vl = VSETVL(m);
            a_ptr = a;
            x_ptr = x;
            vy = VLSEV_FLOAT(y, stride_y, vl);
            for(i = 0; i < n; i++) {
                va = VLEV_FLOAT(a_ptr, vl);
                vy = VFMACCVF_FLOAT(vy, (alpha * (*x_ptr)), va, vl);

                a_ptr += lda;
                x_ptr += inc_x;
            }
            VSSEV_FLOAT(y, stride_y, vy, vl);
        }

    }
    return(0);
}
@@ -0,0 +1,119 @@

/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m8(n)
#define VSETVL_MAX vsetvlmax_e32m8()
#define VSETVL_MAX_M1 vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLEV_FLOAT vle32_v_f32m8
#define VLSEV_FLOAT vlse32_v_f32m8
#define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1
#define VFMACCVV_FLOAT vfmacc_vv_f32m8
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m8()
#define VSETVL_MAX_M1 vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLEV_FLOAT vle64_v_f64m8
#define VLSEV_FLOAT vlse64_v_f64m8
#define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1
#define VFMACCVV_FLOAT vfmacc_vv_f64m8
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
#endif

int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
    BLASLONG i, j;
    FLOAT *a_ptr, *x_ptr;

    FLOAT_V_T va, vx, vr;
    FLOAT_V_T_M1 v_res, v_z0;
    size_t vlmax = VSETVL_MAX_M1;
    v_res = VFMVVF_FLOAT_M1(0, vlmax);
    v_z0 = VFMVVF_FLOAT_M1(0, vlmax);
    vlmax = VSETVL_MAX;

    if(inc_x == 1) {

        for(i = 0; i < n; i++) {
            j = m;
            a_ptr = a;
            x_ptr = x;
            vr = VFMVVF_FLOAT(0, vlmax);

            for (size_t vl; j > 0; j -= vl, a_ptr += vl, x_ptr += vl) {
                vl = VSETVL(j);

                va = VLEV_FLOAT(a_ptr, vl);
                vx = VLEV_FLOAT(x_ptr, vl);
                vr = VFMACCVV_FLOAT(vr, va, vx, vl);
            }

            v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax);
            *y += alpha * VFMVFS_FLOAT_M1(v_res);
            y += inc_y;
            a += lda;
        }

    } else {

        BLASLONG stride_x = inc_x * sizeof(FLOAT);

        for(i = 0; i < n; i++) {
            j = m;
            a_ptr = a;
            x_ptr = x;
            vr = VFMVVF_FLOAT(0, vlmax);

            for (size_t vl; j > 0; j -= vl, a_ptr += vl, x_ptr += vl*inc_x) {
                vl = VSETVL(j);

                va = VLEV_FLOAT(a_ptr, vl);
                vx = VLSEV_FLOAT(x_ptr, stride_x, vl);
                vr = VFMACCVV_FLOAT(vr, va, vx, vl);
            }

            v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax);
            *y += alpha * VFMVFS_FLOAT_M1(v_res);
            y += inc_y;
            a += lda;
        }

    }

    return(0);
}
@@ -0,0 +1,150 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e64m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e64m8()
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VLEV_FLOAT vle64_v_f64m8
|
||||
#define VLSEV_FLOAT vlse64_v_f64m8
|
||||
#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
|
||||
#define MASK_T vbool8_t
|
||||
#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
|
||||
#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
|
||||
#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f64m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
|
||||
#define VFABSV_FLOAT vfabs_v_f64m8
|
||||
#define VFMAXVV_FLOAT vfmax_vv_f64m8
|
||||
#define VFIRSTM vfirst_m_b8
|
||||
#define UINT_V_T vuint64m8_t
|
||||
#define VIDV_MASK_UINT vid_v_u64m8_m
|
||||
#define VIDV_UINT vid_v_u64m8
|
||||
#define VADDVX_MASK_UINT vadd_vx_u64m8_m
|
||||
#define VADDVX_UINT vadd_vx_u64m8
|
||||
#define VMVVX_UINT vmv_v_x_u64m8
|
||||
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
|
||||
#define VSLIDEDOWN_UINT vslidedown_vx_u64m8
|
||||
#define VMVVXS_UINT vmv_x_s_u64m8_u64
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e32m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m8()
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VLEV_FLOAT vle32_v_f32m8
|
||||
#define VLSEV_FLOAT vlse32_v_f32m8
|
||||
#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
|
||||
#define MASK_T vbool4_t
|
||||
#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
|
||||
#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
|
||||
#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f32m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
|
||||
#define VFABSV_FLOAT vfabs_v_f32m8
|
||||
#define VFMAXVV_FLOAT vfmax_vv_f32m8
|
||||
#define VFIRSTM vfirst_m_b4
|
||||
#define UINT_V_T vuint32m8_t
|
||||
#define VIDV_MASK_UINT vid_v_u32m8_m
|
||||
#define VIDV_UINT vid_v_u32m8
|
||||
#define VADDVX_MASK_UINT vadd_vx_u32m8_m
|
||||
#define VADDVX_UINT vadd_vx_u32m8
|
||||
#define VMVVX_UINT vmv_v_x_u32m8
|
||||
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
|
||||
#define VSLIDEDOWN_UINT vslidedown_vx_u32m8
|
||||
#define VMVVXS_UINT vmv_x_s_u32m8_u32
|
||||
#endif
|
||||
|
||||
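/*
 * iamax: returns the 1-based index of the element with the largest
 * absolute value. A per-lane running maximum (v_max) and its source index
 * (v_max_index) are maintained across strips via the masked vid/vadd
 * sequence below; after the loop a reduction yields the scalar maximum and
 * vfirst/vslidedown extract the index of the first lane that attains it.
 */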
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
unsigned int max_index = 0;
|
||||
if (n <= 0 || inc_x <= 0) return(max_index);
|
||||
|
||||
FLOAT_V_T vx, v_max;
|
||||
UINT_V_T v_max_index;
|
||||
MASK_T mask;
|
||||
|
||||
size_t vlmax = VSETVL_MAX;
|
||||
v_max_index = VMVVX_UINT(0, vlmax);
|
||||
v_max = VFMVVF_FLOAT(-1, vlmax);
|
||||
BLASLONG j=0;
|
||||
FLOAT maxf=0.0;
|
||||
|
||||
if(inc_x == 1) {
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl, j += vl) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
vx = VLEV_FLOAT(x, vl);
|
||||
vx = VFABSV_FLOAT(vx, vl);
|
||||
|
||||
//index where element greater than v_max
|
||||
mask = VMFLTVV_FLOAT(v_max, vx, vl);
|
||||
v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl);
|
||||
v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl);
|
||||
|
||||
//update v_max
|
||||
v_max = VFMAXVV_FLOAT(v_max, vx, vl);
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
BLASLONG stride_x = inc_x * sizeof(FLOAT);
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
vx = VLSEV_FLOAT(x, stride_x, vl);
|
||||
vx = VFABSV_FLOAT(vx, vl);
|
||||
|
||||
//index where element greater than v_max
|
||||
mask = VMFLTVV_FLOAT(v_max, vx, vl);
|
||||
v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl);
|
||||
v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl);
|
||||
|
||||
//update v_max
|
||||
v_max = VFMAXVV_FLOAT(v_max, vx, vl);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
FLOAT_V_T_M1 v_res, v_z0;
|
||||
v_res = VFMVVF_FLOAT_M1(0, vlmax);
|
||||
v_z0 = VFMVVF_FLOAT_M1(0, vlmax);
|
||||
|
||||
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, vlmax);
|
||||
maxf = VFMVFS_FLOAT_M1(v_res);
|
||||
mask = VMFGEVF_FLOAT(v_max, maxf, vlmax);
|
||||
max_index = VFIRSTM(mask, vlmax);
|
||||
|
||||
v_max_index = VSLIDEDOWN_UINT(v_max_index, v_max_index, max_index, vlmax);
|
||||
max_index = VMVVXS_UINT(v_max_index);
|
||||
|
||||
return(max_index+1);
|
||||
}
|
|
@@ -0,0 +1,151 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <float.h>
|
||||
|
||||
#if defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e64m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e64m8()
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VLEV_FLOAT vle64_v_f64m8
|
||||
#define VLSEV_FLOAT vlse64_v_f64m8
|
||||
#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
|
||||
#define MASK_T vbool8_t
|
||||
#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
|
||||
#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
|
||||
#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f64m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
|
||||
#define VFABSV_FLOAT vfabs_v_f64m8
|
||||
#define VFMINVV_FLOAT vfmin_vv_f64m8
|
||||
#define VFIRSTM vfirst_m_b8
|
||||
#define UINT_V_T vuint64m8_t
|
||||
#define VIDV_MASK_UINT vid_v_u64m8_m
|
||||
#define VIDV_UINT vid_v_u64m8
|
||||
#define VADDVX_MASK_UINT vadd_vx_u64m8_m
|
||||
#define VADDVX_UINT vadd_vx_u64m8
|
||||
#define VMVVX_UINT vmv_v_x_u64m8
|
||||
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
|
||||
#define VSLIDEDOWN_UINT vslidedown_vx_u64m8
|
||||
#define VMVVXS_UINT vmv_x_s_u64m8_u64
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e32m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m8()
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VLEV_FLOAT vle32_v_f32m8
|
||||
#define VLSEV_FLOAT vlse32_v_f32m8
|
||||
#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
|
||||
#define MASK_T vbool4_t
|
||||
#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
|
||||
#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
|
||||
#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f32m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
|
||||
#define VFABSV_FLOAT vfabs_v_f32m8
|
||||
#define VFMINVV_FLOAT vfmin_vv_f32m8
|
||||
#define VFIRSTM vfirst_m_b4
|
||||
#define UINT_V_T vuint32m8_t
|
||||
#define VIDV_MASK_UINT vid_v_u32m8_m
|
||||
#define VIDV_UINT vid_v_u32m8
|
||||
#define VADDVX_MASK_UINT vadd_vx_u32m8_m
|
||||
#define VADDVX_UINT vadd_vx_u32m8
|
||||
#define VMVVX_UINT vmv_v_x_u32m8
|
||||
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
|
||||
#define VSLIDEDOWN_UINT vslidedown_vx_u32m8
|
||||
#define VMVVXS_UINT vmv_x_s_u32m8_u32
|
||||
#endif
|
||||
|
||||
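/* iamin: same masked index-tracking scheme as the iamax kernel above, but
   it tracks the smallest absolute value (v_min starts at FLT_MAX). */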
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
unsigned int min_index = 0;
|
||||
if (n <= 0 || inc_x <= 0) return(min_index);
|
||||
|
||||
FLOAT_V_T vx, v_min;
|
||||
UINT_V_T v_min_index;
|
||||
MASK_T mask;
|
||||
|
||||
size_t vlmax = VSETVL_MAX;
|
||||
v_min_index = VMVVX_UINT(0, vlmax);
|
||||
v_min = VFMVVF_FLOAT(FLT_MAX, vlmax);
|
||||
BLASLONG j=0;
|
||||
FLOAT minf=0.0;
|
||||
|
||||
if(inc_x == 1) {
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl, j += vl) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
vx = VLEV_FLOAT(x, vl);
|
||||
vx = VFABSV_FLOAT(vx, vl);
|
||||
|
||||
// index where element less than v_min
|
||||
mask = VMFLTVV_FLOAT(vx, v_min, vl);
|
||||
v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl);
|
||||
v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl);
|
||||
|
||||
//update v_min and start_index j
|
||||
v_min = VFMINVV_FLOAT(v_min, vx, vl);
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
BLASLONG stride_x = inc_x * sizeof(FLOAT);
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
vx = VLSEV_FLOAT(x, stride_x, vl);
|
||||
vx = VFABSV_FLOAT(vx, vl);
|
||||
|
||||
// index where element less than v_min
|
||||
mask = VMFLTVV_FLOAT(vx, v_min, vl);
|
||||
v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl);
|
||||
v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl);
|
||||
|
||||
//update v_min and start_index j
|
||||
v_min = VFMINVV_FLOAT(v_min, vx, vl);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
FLOAT_V_T_M1 v_res, v_max;
|
||||
v_res = VFMVVF_FLOAT_M1(0, vlmax);
|
||||
v_max = VFMVVF_FLOAT_M1(FLT_MAX, vlmax);
|
||||
|
||||
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, vlmax);
|
||||
minf = VFMVFS_FLOAT_M1(v_res);
|
||||
mask = VMFLEVF_FLOAT(v_min, minf, vlmax);
|
||||
min_index = VFIRSTM(mask, vlmax);
|
||||
|
||||
v_min_index = VSLIDEDOWN_UINT(v_min_index, v_min_index, min_index, vlmax);
|
||||
min_index = VMVVXS_UINT(v_min_index);
|
||||
|
||||
return(min_index+1);
|
||||
}
|
|
@@ -0,0 +1,147 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <float.h>
|
||||
|
||||
#if defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e64m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e64m8()
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VLEV_FLOAT vle64_v_f64m8
|
||||
#define VLSEV_FLOAT vlse64_v_f64m8
|
||||
#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
|
||||
#define MASK_T vbool8_t
|
||||
#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
|
||||
#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
|
||||
#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f64m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
|
||||
#define VFMAXVV_FLOAT vfmax_vv_f64m8
|
||||
#define VFIRSTM vfirst_m_b8
|
||||
#define UINT_V_T vuint64m8_t
|
||||
#define VIDV_MASK_UINT vid_v_u64m8_m
|
||||
#define VIDV_UINT vid_v_u64m8
|
||||
#define VADDVX_MASK_UINT vadd_vx_u64m8_m
|
||||
#define VADDVX_UINT vadd_vx_u64m8
|
||||
#define VMVVX_UINT vmv_v_x_u64m8
|
||||
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
|
||||
#define VSLIDEDOWN_UINT vslidedown_vx_u64m8
|
||||
#define VMVVXS_UINT vmv_x_s_u64m8_u64
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e32m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m8()
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VLEV_FLOAT vle32_v_f32m8
|
||||
#define VLSEV_FLOAT vlse32_v_f32m8
|
||||
#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
|
||||
#define MASK_T vbool4_t
|
||||
#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
|
||||
#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
|
||||
#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f32m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
|
||||
#define VFMAXVV_FLOAT vfmax_vv_f32m8
|
||||
#define VFIRSTM vfirst_m_b4
|
||||
#define UINT_V_T vuint32m8_t
|
||||
#define VIDV_MASK_UINT vid_v_u32m8_m
|
||||
#define VIDV_UINT vid_v_u32m8
|
||||
#define VADDVX_MASK_UINT vadd_vx_u32m8_m
|
||||
#define VADDVX_UINT vadd_vx_u32m8
|
||||
#define VMVVX_UINT vmv_v_x_u32m8
|
||||
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
|
||||
#define VSLIDEDOWN_UINT vslidedown_vx_u32m8
|
||||
#define VMVVXS_UINT vmv_x_s_u32m8_u32
|
||||
#endif
|
||||
|
||||
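/* imax: index of the largest signed element; same structure as iamax above
   but without the vfabs step, with v_max initialized to -FLT_MAX. */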
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
unsigned int max_index = 0;
|
||||
if (n <= 0 || inc_x <= 0) return(max_index);
|
||||
|
||||
FLOAT_V_T vx, v_max;
|
||||
UINT_V_T v_max_index;
|
||||
MASK_T mask;
|
||||
|
||||
size_t vlmax = VSETVL_MAX;
|
||||
v_max_index = VMVVX_UINT(0, vlmax);
|
||||
v_max = VFMVVF_FLOAT(-FLT_MAX, vlmax);
|
||||
BLASLONG j=0;
|
||||
FLOAT maxf=0.0;
|
||||
|
||||
if(inc_x == 1) {
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl, j += vl) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
vx = VLEV_FLOAT(x, vl);
|
||||
|
||||
//index where element greater than v_max
|
||||
mask = VMFLTVV_FLOAT(v_max, vx, vl);
|
||||
v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl);
|
||||
v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl);
|
||||
|
||||
//update v_max and start_index j
|
||||
v_max = VFMAXVV_FLOAT(v_max, vx, vl);
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
BLASLONG stride_x = inc_x * sizeof(FLOAT);
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
vx = VLSEV_FLOAT(x, stride_x, vl);
|
||||
|
||||
//index where element greater than v_max
|
||||
mask = VMFLTVV_FLOAT(v_max, vx, vl);
|
||||
v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl);
|
||||
v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl);
|
||||
|
||||
//update v_max and start_index j
|
||||
v_max = VFMAXVV_FLOAT(v_max, vx, vl);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
FLOAT_V_T_M1 v_res, v_min;
|
||||
v_res = VFMVVF_FLOAT_M1(0, vlmax);
|
||||
v_min = VFMVVF_FLOAT_M1(-FLT_MAX, vlmax);
|
||||
|
||||
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, vlmax);
|
||||
maxf = VFMVFS_FLOAT_M1(v_res);
|
||||
mask = VMFGEVF_FLOAT(v_max, maxf, vlmax);
|
||||
max_index = VFIRSTM(mask, vlmax);
|
||||
|
||||
v_max_index = VSLIDEDOWN_UINT(v_max_index, v_max_index, max_index, vlmax);
|
||||
max_index = VMVVXS_UINT(v_max_index);
|
||||
|
||||
return(max_index+1);
|
||||
}
|
|
@@ -0,0 +1,147 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <float.h>
|
||||
|
||||
#if defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e64m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e64m8()
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VLEV_FLOAT vle64_v_f64m8
|
||||
#define VLSEV_FLOAT vlse64_v_f64m8
|
||||
#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
|
||||
#define MASK_T vbool8_t
|
||||
#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
|
||||
#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
|
||||
#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f64m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
|
||||
#define VFMINVV_FLOAT vfmin_vv_f64m8
|
||||
#define VFIRSTM vfirst_m_b8
|
||||
#define UINT_V_T vuint64m8_t
|
||||
#define VIDV_MASK_UINT vid_v_u64m8_m
|
||||
#define VIDV_UINT vid_v_u64m8
|
||||
#define VADDVX_MASK_UINT vadd_vx_u64m8_m
|
||||
#define VADDVX_UINT vadd_vx_u64m8
|
||||
#define VMVVX_UINT vmv_v_x_u64m8
|
||||
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
|
||||
#define VSLIDEDOWN_UINT vslidedown_vx_u64m8
|
||||
#define VMVVXS_UINT vmv_x_s_u64m8_u64
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e32m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m8()
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VLEV_FLOAT vle32_v_f32m8
|
||||
#define VLSEV_FLOAT vlse32_v_f32m8
|
||||
#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
|
||||
#define MASK_T vbool4_t
|
||||
#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
|
||||
#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
|
||||
#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f32m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
|
||||
#define VFMINVV_FLOAT vfmin_vv_f32m8
|
||||
#define VFIRSTM vfirst_m_b4
|
||||
#define UINT_V_T vuint32m8_t
|
||||
#define VIDV_MASK_UINT vid_v_u32m8_m
|
||||
#define VIDV_UINT vid_v_u32m8
|
||||
#define VADDVX_MASK_UINT vadd_vx_u32m8_m
|
||||
#define VADDVX_UINT vadd_vx_u32m8
|
||||
#define VMVVX_UINT vmv_v_x_u32m8
|
||||
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
|
||||
#define VSLIDEDOWN_UINT vslidedown_vx_u32m8
|
||||
#define VMVVXS_UINT vmv_x_s_u32m8_u32
|
||||
#endif
|
||||
|
||||
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
unsigned int min_index = 0;
|
||||
if (n <= 0 || inc_x <= 0) return(min_index);
|
||||
|
||||
FLOAT_V_T vx, v_min;
|
||||
UINT_V_T v_min_index;
|
||||
MASK_T mask;
|
||||
|
||||
size_t vlmax = VSETVL_MAX;
|
||||
v_min_index = VMVVX_UINT(0, vlmax);
|
||||
v_min = VFMVVF_FLOAT(FLT_MAX, vlmax);
|
||||
BLASLONG j=0;
|
||||
FLOAT minf=0.0;
|
||||
|
||||
if(inc_x == 1) {
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl, j += vl) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
vx = VLEV_FLOAT(x, vl);
|
||||
|
||||
// index where element less than v_min
|
||||
mask = VMFLTVV_FLOAT(vx, v_min, vl);
|
||||
v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl);
|
||||
v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl);
|
||||
|
||||
//update v_min and start_index j
|
||||
v_min = VFMINVV_FLOAT(v_min, vx, vl);
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
BLASLONG stride_x = inc_x * sizeof(FLOAT);
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
vx = VLSEV_FLOAT(x, stride_x, vl);
|
||||
|
||||
// index where element less than v_min
|
||||
mask = VMFLTVV_FLOAT(vx, v_min, vl);
|
||||
v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl);
|
||||
v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl);
|
||||
|
||||
//update v_min and start_index j
|
||||
v_min = VFMINVV_FLOAT(v_min, vx, vl);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
FLOAT_V_T_M1 v_res, v_max;
|
||||
v_res = VFMVVF_FLOAT_M1(0, vlmax);
|
||||
v_max = VFMVVF_FLOAT_M1(FLT_MAX, vlmax);
|
||||
|
||||
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, vlmax);
|
||||
minf = VFMVFS_FLOAT_M1(v_res);
|
||||
mask = VMFLEVF_FLOAT(v_min, minf, vlmax);
|
||||
min_index = VFIRSTM(mask, vlmax);
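// VFIRSTM gives the position of the first lane holding the global minimum; the slidedown
// below moves that lane to element 0 so its stored element index can be read out with vmv.x.s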
|
||||
|
||||
v_min_index = VSLIDEDOWN_UINT(v_min_index, v_min_index, min_index, vlmax);
|
||||
min_index = VMVVXS_UINT(v_min_index);
|
||||
|
||||
return(min_index+1);
|
||||
}
|
|
@@ -0,0 +1,162 @@
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e64m4(n)
|
||||
#define VSETVL_MAX vsetvlmax_e64m4()
|
||||
#define FLOAT_V_T vfloat64m4_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VLEV_FLOAT vle64_v_f64m4
|
||||
#define VLSEV_FLOAT vlse64_v_f64m4
|
||||
#define VLSEG_FLOAT vlseg2e64_v_f64m4
|
||||
#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
|
||||
#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1
|
||||
#define MASK_T vbool16_t
|
||||
#define VMFLTVF_FLOAT vmflt_vf_f64m4_b16
|
||||
#define VMFLTVV_FLOAT vmflt_vv_f64m4_b16
|
||||
#define VMFGEVF_FLOAT vmfge_vf_f64m4_b16
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f64m4
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
|
||||
#define VFABSV_FLOAT vfabs_v_f64m4
|
||||
#define VFMAXVV_FLOAT vfmax_vv_f64m4
|
||||
#define VFADDVV_FLOAT vfadd_vv_f64m4
|
||||
#define VFIRSTM vfirst_m_b16
|
||||
#define UINT_V_T vuint64m4_t
|
||||
#define VIDV_MASK_UINT vid_v_u64m4_m
|
||||
#define VIDV_UINT vid_v_u64m4
|
||||
#define VADDVX_MASK_UINT vadd_vx_u64m4_m
|
||||
#define VADDVX_UINT vadd_vx_u64m4
|
||||
#define VMVVX_UINT vmv_v_x_u64m4
|
||||
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
|
||||
#define VSLIDEDOWN_UINT vslidedown_vx_u64m4
|
||||
#define VMVVXS_UINT vmv_x_s_u64m4_u64
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e32m4(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m4()
|
||||
#define FLOAT_V_T vfloat32m4_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VLEV_FLOAT vle32_v_f32m4
|
||||
#define VLSEV_FLOAT vlse32_v_f32m4
|
||||
#define VLSEG_FLOAT vlseg2e32_v_f32m4
|
||||
#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
|
||||
#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1
|
||||
#define MASK_T vbool8_t
|
||||
#define VMFLTVF_FLOAT vmflt_vf_f32m4_b8
|
||||
#define VMFLTVV_FLOAT vmflt_vv_f32m4_b8
|
||||
#define VMFGEVF_FLOAT vmfge_vf_f32m4_b8
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f32m4
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
|
||||
#define VFABSV_FLOAT vfabs_v_f32m4
|
||||
#define VFMAXVV_FLOAT vfmax_vv_f32m4
|
||||
#define VFADDVV_FLOAT vfadd_vv_f32m4
|
||||
#define VFIRSTM vfirst_m_b8
|
||||
#define UINT_V_T vuint32m4_t
|
||||
#define VIDV_MASK_UINT vid_v_u32m4_m
|
||||
#define VIDV_UINT vid_v_u32m4
|
||||
#define VADDVX_MASK_UINT vadd_vx_u32m4_m
|
||||
#define VADDVX_UINT vadd_vx_u32m4
|
||||
#define VMVVX_UINT vmv_v_x_u32m4
|
||||
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
|
||||
#define VSLIDEDOWN_UINT vslidedown_vx_u32m4
|
||||
#define VMVVXS_UINT vmv_x_s_u32m4_u32
|
||||
#endif
|
||||
|
||||
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
unsigned int max_index = 0;
|
||||
if (n <= 0 || inc_x <= 0) return(max_index);
|
||||
|
||||
FLOAT_V_T vx0, vx1, v_max;
|
||||
UINT_V_T v_max_index;
|
||||
MASK_T mask;
|
||||
|
||||
size_t vlmax = VSETVL_MAX;
|
||||
v_max_index = VMVVX_UINT(0, vlmax);
|
||||
v_max = VFMVVF_FLOAT(-1, vlmax);
|
||||
BLASLONG j=0;
|
||||
FLOAT maxf=0.0;
|
||||
|
||||
if(inc_x == 1) {
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl*2, j += vl) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
VLSEG_FLOAT(&vx0, &vx1, x, vl);
|
||||
|
||||
vx0 = VFABSV_FLOAT(vx0, vl);
|
||||
vx1 = VFABSV_FLOAT(vx1, vl);
|
||||
|
||||
vx0 = VFADDVV_FLOAT(vx0, vx1, vl);
|
||||
|
||||
//index where element greater than v_max
|
||||
mask = VMFLTVV_FLOAT(v_max, vx0, vl);
|
||||
v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl);
|
||||
v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl);
|
||||
|
||||
//update v_max and start_index j
|
||||
v_max = VFMAXVV_FLOAT(v_max, vx0, vl);
|
||||
}
|
||||
}
|
||||
else {
|
||||
|
||||
BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
|
||||
for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, j += vl) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
|
||||
|
||||
vx0 = VFABSV_FLOAT(vx0, vl);
|
||||
vx1 = VFABSV_FLOAT(vx1, vl);
|
||||
|
||||
vx0 = VFADDVV_FLOAT(vx0, vx1, vl);
|
||||
|
||||
//index where element greater than v_max
|
||||
mask = VMFLTVV_FLOAT(v_max, vx0, vl);
|
||||
v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl);
|
||||
v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl);
|
||||
|
||||
//update v_max and start_index j
|
||||
v_max = VFMAXVV_FLOAT(v_max, vx0, vl);
|
||||
}
|
||||
|
||||
}
|
||||
FLOAT_V_T_M1 v_res, v_z0;
|
||||
v_res = VFMVVF_FLOAT_M1(0, vlmax);
|
||||
v_z0 = VFMVVF_FLOAT_M1(0, vlmax);
|
||||
|
||||
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, vlmax);
|
||||
maxf = VFMVFS_FLOAT_M1(v_res);
|
||||
mask = VMFGEVF_FLOAT(v_max, maxf, vlmax);
|
||||
max_index = VFIRSTM(mask, vlmax);
|
||||
|
||||
v_max_index = VSLIDEDOWN_UINT(v_max_index, v_max_index, max_index, vlmax);
|
||||
max_index = VMVVXS_UINT(v_max_index);
|
||||
|
||||
return(max_index+1);
|
||||
}
|
|
@@ -0,0 +1,161 @@
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <float.h>
|
||||
|
||||
#if defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e64m4(n)
|
||||
#define VSETVL_MAX vsetvlmax_e64m4()
|
||||
#define FLOAT_V_T vfloat64m4_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VLSEG_FLOAT vlseg2e64_v_f64m4
|
||||
#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
|
||||
#define VFREDMINVS_FLOAT vfredmin_vs_f64m4_f64m1
|
||||
#define MASK_T vbool16_t
|
||||
#define VMFLTVF_FLOAT vmflt_vf_f64m4_b16
|
||||
#define VMFLTVV_FLOAT vmflt_vv_f64m4_b16
|
||||
#define VMFLEVF_FLOAT vmfle_vf_f64m4_b16
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f64m4
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
|
||||
#define VFABSV_FLOAT vfabs_v_f64m4
|
||||
#define VFMINVV_FLOAT vfmin_vv_f64m4
|
||||
#define VFADDVV_FLOAT vfadd_vv_f64m4
|
||||
#define VFIRSTM vfirst_m_b16
|
||||
#define UINT_V_T vuint64m4_t
|
||||
#define VIDV_MASK_UINT vid_v_u64m4_m
|
||||
#define VIDV_UINT vid_v_u64m4
|
||||
#define VADDVX_MASK_UINT vadd_vx_u64m4_m
|
||||
#define VADDVX_UINT vadd_vx_u64m4
|
||||
#define VMVVX_UINT vmv_v_x_u64m4
|
||||
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
|
||||
#define VSLIDEDOWN_UINT vslidedown_vx_u64m4
|
||||
#define VMVVXS_UINT vmv_x_s_u64m4_u64
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e32m4(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m4()
|
||||
#define FLOAT_V_T vfloat32m4_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VLSEG_FLOAT vlseg2e32_v_f32m4
|
||||
#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
|
||||
#define VFREDMINVS_FLOAT vfredmin_vs_f32m4_f32m1
|
||||
#define MASK_T vbool8_t
|
||||
#define VMFLTVF_FLOAT vmflt_vf_f32m4_b8
|
||||
#define VMFLTVV_FLOAT vmflt_vv_f32m4_b8
|
||||
#define VMFLEVF_FLOAT vmfle_vf_f32m4_b8
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f32m4
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
|
||||
#define VFABSV_FLOAT vfabs_v_f32m4
|
||||
#define VFMINVV_FLOAT vfmin_vv_f32m4
|
||||
#define VFADDVV_FLOAT vfadd_vv_f32m4
|
||||
#define VFIRSTM vfirst_m_b8
|
||||
#define UINT_V_T vuint32m4_t
|
||||
#define VIDV_MASK_UINT vid_v_u32m4_m
|
||||
#define VIDV_UINT vid_v_u32m4
|
||||
#define VADDVX_MASK_UINT vadd_vx_u32m4_m
|
||||
#define VADDVX_UINT vadd_vx_u32m4
|
||||
#define VMVVX_UINT vmv_v_x_u32m4
|
||||
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
|
||||
#define VSLIDEDOWN_UINT vslidedown_vx_u32m4
|
||||
#define VMVVXS_UINT vmv_x_s_u32m4_u32
|
||||
#endif
|
||||
|
||||
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
unsigned int min_index = 0;
|
||||
if (n <= 0 || inc_x <= 0) return(min_index);
|
||||
|
||||
FLOAT_V_T vx0, vx1, v_min;
|
||||
UINT_V_T v_min_index;
|
||||
MASK_T mask;
|
||||
|
||||
size_t vlmax = VSETVL_MAX;
|
||||
v_min_index = VMVVX_UINT(0, vlmax);
|
||||
v_min = VFMVVF_FLOAT(FLT_MAX, vlmax);
|
||||
BLASLONG j=0;
|
||||
FLOAT minf=0.0;
|
||||
|
||||
if(inc_x == 1) {
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl*2, j += vl) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
VLSEG_FLOAT(&vx0, &vx1, x, vl);
|
||||
|
||||
vx0 = VFABSV_FLOAT(vx0, vl);
|
||||
vx1 = VFABSV_FLOAT(vx1, vl);
|
||||
|
||||
vx0 = VFADDVV_FLOAT(vx0, vx1, vl);
|
||||
|
||||
// index where element less than v_min
|
||||
mask = VMFLTVV_FLOAT(vx0, v_min, vl);
|
||||
v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl);
|
||||
v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl);
|
||||
|
||||
//update v_min and start_index j
|
||||
v_min = VFMINVV_FLOAT(v_min, vx0, vl);
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, j += vl) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
|
||||
|
||||
vx0 = VFABSV_FLOAT(vx0, vl);
|
||||
vx1 = VFABSV_FLOAT(vx1, vl);
|
||||
|
||||
vx0 = VFADDVV_FLOAT(vx0, vx1, vl);
|
||||
|
||||
// index where element less than v_min
|
||||
mask = VMFLTVV_FLOAT(vx0, v_min, vl);
|
||||
v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl);
|
||||
v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl);
|
||||
|
||||
//update v_min and start_index j
|
||||
v_min = VFMINVV_FLOAT(v_min, vx0, vl);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
FLOAT_V_T_M1 v_res, v_max;
|
||||
v_res = VFMVVF_FLOAT_M1(0, vlmax);
|
||||
v_max = VFMVVF_FLOAT_M1(FLT_MAX, vlmax);
|
||||
|
||||
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, vlmax);
|
||||
minf = VFMVFS_FLOAT_M1(v_res);
|
||||
mask = VMFLEVF_FLOAT(v_min, minf, vlmax);
|
||||
min_index = VFIRSTM(mask, vlmax);
|
||||
|
||||
v_min_index = VSLIDEDOWN_UINT(v_min_index, v_min_index, min_index, vlmax);
|
||||
min_index = VMVVXS_UINT(v_min_index);
|
||||
|
||||
return(min_index+1);
|
||||
}
|
|
@@ -0,0 +1,98 @@
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <float.h>
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m8()
|
||||
#define VSETVL_MAX_M1 vsetvlmax_e32m1()
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VLEV_FLOAT vle32_v_f32m8
|
||||
#define VLSEV_FLOAT vlse32_v_f32m8
|
||||
#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f32m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
|
||||
#define VFMAXVV_FLOAT vfmax_vv_f32m8
|
||||
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e64m8()
|
||||
#define VSETVL_MAX_M1 vsetvlmax_e64m1()
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VLEV_FLOAT vle64_v_f64m8
|
||||
#define VLSEV_FLOAT vlse64_v_f64m8
|
||||
#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f64m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
|
||||
#define VFMAXVV_FLOAT vfmax_vv_f64m8
|
||||
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
|
||||
#endif
|
||||
|
||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    FLOAT maxf = 0.0;

    if (n <= 0 || inc_x <= 0) return(maxf);

    FLOAT_V_T vx, vmax;
    FLOAT_V_T_M1 v_res;

    v_res = VFMVVF_FLOAT_M1(-FLT_MAX, VSETVL_MAX_M1);
    size_t vlmax = VSETVL_MAX;
    vmax = VFMVVF_FLOAT(-FLT_MAX, vlmax);

    if(inc_x == 1) {

        for (size_t vl; n > 0; n -= vl, x += vl) {
            vl = VSETVL(n);

            vx = VLEV_FLOAT(x, vl);
            vmax = VFMAXVV_FLOAT(vmax, vx, vl);
        }

    } else {

        BLASLONG stride_x = inc_x * sizeof(FLOAT);

        for (size_t vl; n > 0; n -= vl, x += vl*inc_x) {
            vl = VSETVL(n);

            vx = VLSEV_FLOAT(x, stride_x, vl);
            vmax = VFMAXVV_FLOAT(vmax, vx, vl);
        }

    }

    v_res = VFREDMAXVS_FLOAT(v_res, vmax, v_res, vlmax);
    maxf = VFMVFS_FLOAT_M1(v_res);

    return(maxf);
}
@@ -0,0 +1,98 @@
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <float.h>
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m8()
|
||||
#define VSETVL_MAX_M1 vsetvlmax_e32m1()
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VLEV_FLOAT vle32_v_f32m8
|
||||
#define VLSEV_FLOAT vlse32_v_f32m8
|
||||
#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f32m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
|
||||
#define VFMINVV_FLOAT vfmin_vv_f32m8
|
||||
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e64m8()
|
||||
#define VSETVL_MAX_M1 vsetvlmax_e64m1()
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VLEV_FLOAT vle64_v_f64m8
|
||||
#define VLSEV_FLOAT vlse64_v_f64m8
|
||||
#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f64m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
|
||||
#define VFMINVV_FLOAT vfmin_vv_f64m8
|
||||
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
|
||||
#endif
|
||||
|
||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
FLOAT minf = 0.0;
|
||||
|
||||
if (n <= 0 || inc_x <= 0) return(minf);
|
||||
|
||||
FLOAT_V_T vx, vmin;
|
||||
FLOAT_V_T_M1 v_res;
|
||||
|
||||
v_res = VFMVVF_FLOAT_M1(FLT_MAX, VSETVL_MAX_M1);
|
||||
size_t vlmax = VSETVL_MAX;
|
||||
vmin = VFMVVF_FLOAT(FLT_MAX, vlmax);
|
||||
|
||||
if(inc_x == 1) {
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
vx = VLEV_FLOAT(x, vl);
|
||||
vmin = VFMINVV_FLOAT(vmin, vx, vl);
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
BLASLONG stride_x = inc_x * sizeof(FLOAT);
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl*inc_x) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
vx = VLSEV_FLOAT(x, stride_x, vl);
|
||||
vmin = VFMINVV_FLOAT(vmin, vx, vl);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
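// fold the per-lane minima into one scalar; v_res is preloaded with FLT_MAX and serves both
// as the destination and as the scalar seed of the vfredmin reduction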
v_res = VFREDMINVS_FLOAT(v_res, vmin, v_res, vlmax);
|
||||
minf = VFMVFS_FLOAT_M1(v_res);
|
||||
|
||||
return(minf);
|
||||
}
|
|
@@ -0,0 +1,103 @@
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m8()
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VLEV_FLOAT vle32_v_f32m8
|
||||
#define VLSEV_FLOAT vlse32_v_f32m8
|
||||
#define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1
|
||||
#define VFMACCVV_FLOAT vfmacc_vv_f32m8
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f32m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
|
||||
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
|
||||
#define ABS fabsf
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e64m8()
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VLEV_FLOAT vle64_v_f64m8
|
||||
#define VLSEV_FLOAT vlse64_v_f64m8
|
||||
#define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1
|
||||
#define VFMACCVV_FLOAT vfmacc_vv_f64m8
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f64m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
|
||||
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
|
||||
#define ABS fabs
|
||||
#endif
|
||||
|
||||
|
||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{

    if( n <= 0 ) return(0.0);
    if(n == 1) return (ABS(x[0]));

    FLOAT_V_T vr, v0;
    FLOAT_V_T_M1 v_res;
    FLOAT ssq = 0.0;

    size_t vlmax = VSETVL_MAX;
    v_res = VFMVVF_FLOAT_M1(0, vlmax);

    vr = VFMVVF_FLOAT(0, vlmax);

    if(inc_x == 1) {

        for (size_t vl; n > 0; n -= vl, x += vl) {
            vl = VSETVL(n);

            v0 = VLEV_FLOAT(x, vl);

            // accumulate per-lane partial sums of squares: vr += v0 * v0
            vr = VFMACCVV_FLOAT(vr, v0, v0, vl);
        }

    } else {

        BLASLONG stride_x = inc_x * sizeof(FLOAT);

        for (size_t vl; n > 0; n -= vl, x += vl * inc_x) {
            vl = VSETVL(n);

            v0 = VLSEV_FLOAT(x, stride_x, vl);

            // accumulate per-lane partial sums of squares: vr += v0 * v0
            vr = VFMACCVV_FLOAT(vr, v0, v0, vl);
        }
    }

    // reduce the partial sums to a single scalar and take the square root
    v_res = VFREDSUM_FLOAT(v_res, vr, v_res, vlmax);

    ssq = VFMVFS_FLOAT_M1(v_res);

    return sqrt(ssq);
}
@@ -0,0 +1,149 @@
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m8(n)
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define VLEV_FLOAT vle32_v_f32m8
|
||||
#define VLSEV_FLOAT vlse32_v_f32m8
|
||||
#define VSEV_FLOAT vse32_v_f32m8
|
||||
#define VSSEV_FLOAT vsse32_v_f32m8
|
||||
#define VFMACCVF_FLOAT vfmacc_vf_f32m8
|
||||
#define VFMULVF_FLOAT vfmul_vf_f32m8
|
||||
#define VFMSACVF_FLOAT vfmsac_vf_f32m8
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m8(n)
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define VLEV_FLOAT vle64_v_f64m8
|
||||
#define VLSEV_FLOAT vlse64_v_f64m8
|
||||
#define VSEV_FLOAT vse64_v_f64m8
|
||||
#define VSSEV_FLOAT vsse64_v_f64m8
|
||||
#define VFMACCVF_FLOAT vfmacc_vf_f64m8
|
||||
#define VFMULVF_FLOAT vfmul_vf_f64m8
|
||||
#define VFMSACVF_FLOAT vfmsac_vf_f64m8
|
||||
#endif
|
||||
|
||||
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
|
||||
{
|
||||
|
||||
if(n <= 0) return(0);
|
||||
|
||||
FLOAT_V_T v0, v1, vx, vy;
|
||||
|
||||
if (inc_x == 0 || inc_y == 0) {
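// a zero increment would make all vector lanes address the same element, so this
// case is handled with the plain scalar loop below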
|
||||
BLASLONG i=0;
|
||||
BLASLONG ix=0,iy=0;
|
||||
FLOAT temp;
|
||||
while(i < n)
|
||||
{
|
||||
temp = c*x[ix] + s*y[iy] ;
|
||||
y[iy] = c*y[iy] - s*x[ix] ;
|
||||
x[ix] = temp ;
|
||||
|
||||
ix += inc_x ;
|
||||
iy += inc_y ;
|
||||
i++ ;
|
||||
}
|
||||
}
|
||||
else if(inc_x == 1 && inc_y == 1) {
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl, y += vl) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
vx = VLEV_FLOAT(x, vl);
|
||||
vy = VLEV_FLOAT(y, vl);
|
||||
|
||||
v0 = VFMULVF_FLOAT(vx, c, vl);
|
||||
v0 = VFMACCVF_FLOAT(v0, s, vy, vl);
|
||||
VSEV_FLOAT(x, v0, vl);
|
||||
|
||||
v1 = VFMULVF_FLOAT(vx, s, vl);
|
||||
v1 = VFMSACVF_FLOAT(v1, c, vy, vl);
|
||||
VSEV_FLOAT(y, v1, vl);
|
||||
}
|
||||
|
||||
} else if(inc_y == 1) {
|
||||
BLASLONG stride_x = inc_x * sizeof(FLOAT);
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
vx = VLSEV_FLOAT(x, stride_x, vl);
|
||||
vy = VLEV_FLOAT(y, vl);
|
||||
|
||||
v0 = VFMULVF_FLOAT(vx, c, vl);
|
||||
v0 = VFMACCVF_FLOAT(v0, s, vy, vl);
|
||||
VSSEV_FLOAT(x, stride_x, v0, vl);
|
||||
|
||||
v1 = VFMULVF_FLOAT(vx, s, vl);
|
||||
v1 = VFMSACVF_FLOAT(v1, c, vy, vl);
|
||||
VSEV_FLOAT(y, v1, vl);
|
||||
}
|
||||
|
||||
} else if(inc_x == 1) {
|
||||
BLASLONG stride_y = inc_y * sizeof(FLOAT);
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
vx = VLEV_FLOAT(x, vl);
|
||||
vy = VLSEV_FLOAT(y, stride_y, vl);
|
||||
|
||||
v0 = VFMULVF_FLOAT(vx, c, vl);
|
||||
v0 = VFMACCVF_FLOAT(v0, s, vy, vl);
|
||||
VSEV_FLOAT(x, v0, vl);
|
||||
|
||||
v1 = VFMULVF_FLOAT(vx, s, vl);
|
||||
v1 = VFMSACVF_FLOAT(v1, c, vy, vl);
|
||||
VSSEV_FLOAT(y, stride_y, v1, vl);
|
||||
}
|
||||
|
||||
} else {
|
||||
BLASLONG stride_x = inc_x * sizeof(FLOAT);
|
||||
BLASLONG stride_y = inc_y * sizeof(FLOAT);
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
vx = VLSEV_FLOAT(x, stride_x, vl);
|
||||
vy = VLSEV_FLOAT(y, stride_y, vl);
|
||||
|
||||
v0 = VFMULVF_FLOAT(vx, c, vl);
|
||||
v0 = VFMACCVF_FLOAT(v0, s, vy, vl);
|
||||
VSSEV_FLOAT(x, stride_x, v0, vl);
|
||||
|
||||
v1 = VFMULVF_FLOAT(vx, s, vl);
|
||||
v1 = VFMSACVF_FLOAT(v1, c, vy, vl);
|
||||
VSSEV_FLOAT(y, stride_y, v1, vl);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return(0);
|
||||
}
|
|
@@ -0,0 +1,80 @@
/***************************************************************************
|
||||
Copyright (c) 2020, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m8(n)
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define VLEV_FLOAT vle32_v_f32m8
|
||||
#define VLSEV_FLOAT vlse32_v_f32m8
|
||||
#define VSEV_FLOAT vse32_v_f32m8
|
||||
#define VSSEV_FLOAT vsse32_v_f32m8
|
||||
#define VFMULVF_FLOAT vfmul_vf_f32m8
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f32m8
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m8(n)
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define VLEV_FLOAT vle64_v_f64m8
|
||||
#define VLSEV_FLOAT vlse64_v_f64m8
|
||||
#define VSEV_FLOAT vse64_v_f64m8
|
||||
#define VSSEV_FLOAT vsse64_v_f64m8
|
||||
#define VFMULVF_FLOAT vfmul_vf_f64m8
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f64m8
|
||||
#endif
|
||||
|
||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
    if ( (n <= 0) || (inc_x <= 0)) return(0);

    FLOAT_V_T v0;

    if(inc_x == 1) {

        for (size_t vl; n > 0; n -= vl, x += vl) {
            vl = VSETVL(n);

            v0 = VLEV_FLOAT(x, vl);
            v0 = VFMULVF_FLOAT(v0, da, vl);
            VSEV_FLOAT(x, v0, vl);
        }

    } else {
        BLASLONG stride_x = inc_x * sizeof(FLOAT);

        for (size_t vl; n > 0; n -= vl, x += vl*inc_x) {
            vl = VSETVL(n);

            v0 = VLSEV_FLOAT(x, stride_x, vl);
            v0 = VFMULVF_FLOAT(v0, da, vl);
            VSSEV_FLOAT(x, stride_x, v0, vl);
        }

    }

    return 0;
}
@@ -0,0 +1,95 @@
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m8()
|
||||
#define VSETVL_MAX_M1 vsetvlmax_e32m1()
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VLEV_FLOAT vle32_v_f32m8
|
||||
#define VLSEV_FLOAT vlse32_v_f32m8
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f32m8
|
||||
#define VFADDVV_FLOAT vfadd_vv_f32m8
|
||||
#define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
|
||||
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e64m8()
|
||||
#define VSETVL_MAX_M1 vsetvlmax_e64m1()
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VLEV_FLOAT vle64_v_f64m8
|
||||
#define VLSEV_FLOAT vlse64_v_f64m8
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f64m8
|
||||
#define VFADDVV_FLOAT vfadd_vv_f64m8
|
||||
#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
|
||||
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
|
||||
#endif
|
||||
|
||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
FLOAT sumf = 0.0;
|
||||
if (n <= 0 || inc_x <= 0) return(sumf);
|
||||
|
||||
FLOAT_V_T vx, vsum;
|
||||
FLOAT_V_T_M1 v_res;
|
||||
|
||||
v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1);
|
||||
size_t vlmax = VSETVL_MAX;
|
||||
vsum = VFMVVF_FLOAT(0.0, vlmax);
|
||||
|
||||
if(inc_x == 1) {
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
vx = VLEV_FLOAT(x, vl);
|
||||
vsum = VFADDVV_FLOAT(vsum, vx, vl);
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
BLASLONG stride_x = inc_x * sizeof(FLOAT);
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl*inc_x) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
vx = VLSEV_FLOAT(x, stride_x, vl);
|
||||
vsum = VFADDVV_FLOAT(vsum, vx, vl);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
v_res = VFREDSUMVS_FLOAT(v_res, vsum, v_res, vlmax);
|
||||
sumf = VFMVFS_FLOAT_M1(v_res);
|
||||
return(sumf);
|
||||
}
|
|
@@ -0,0 +1,142 @@
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m8()
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define VLEV_FLOAT vle32_v_f32m8
|
||||
#define VLSEV_FLOAT vlse32_v_f32m8
|
||||
#define VSEV_FLOAT vse32_v_f32m8
|
||||
#define VSSEV_FLOAT vsse32_v_f32m8
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f32m8
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e64m8()
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define VLEV_FLOAT vle64_v_f64m8
|
||||
#define VLSEV_FLOAT vlse64_v_f64m8
|
||||
#define VSEV_FLOAT vse64_v_f64m8
|
||||
#define VSSEV_FLOAT vsse64_v_f64m8
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f64m8
|
||||
#endif
|
||||
|
||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
|
||||
{
|
||||
BLASLONG stride_x, stride_y;
|
||||
FLOAT_V_T vx, vy;
|
||||
|
||||
if (n <= 0) return(0);
|
||||
|
||||
if (inc_x == 0 && inc_y == 0) {
|
||||
if (n & 1) {
|
||||
FLOAT temp = x[0];
|
||||
x[0] = y[0];
|
||||
y[0] = temp;
|
||||
}
|
||||
else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
else if(inc_x == 0) {
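// inc_x == 0: x is a single repeated location, so the sequential swap amounts to shifting
// y one element toward its tail, putting the old x[0] into y[0] and the old last y element
// into x[0]; the backward vector copy below performs that shift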
|
||||
FLOAT temp = x[0];
|
||||
x[0] = y[(n - 1) * inc_y];
|
||||
FLOAT* ptr = y + (n - 1) * inc_y; // start from the last one
|
||||
stride_y = (0 - inc_y) * sizeof(FLOAT); // reverse
|
||||
BLASLONG m = n - 1;
|
||||
for (size_t vl; m > 0; m -= vl, ptr -= vl*inc_y) {
|
||||
vl = VSETVL(m);
|
||||
vy = VLSEV_FLOAT(ptr - 1, stride_y, vl);
|
||||
VSSEV_FLOAT(ptr, stride_y, vy, vl);
|
||||
}
|
||||
y[0] = temp;
|
||||
}
|
||||
else if(inc_y == 0) {
|
||||
FLOAT temp = y[0];
|
||||
y[0] = x[(n - 1) * inc_x];
|
||||
FLOAT* ptr = x + (n - 1) * inc_x; // start from the last one
|
||||
stride_x = (0 - inc_x) * sizeof(FLOAT); // reverse
|
||||
BLASLONG m = n - 1;
|
||||
for (size_t vl; m > 0; m -= vl, ptr -= vl*inc_x) {
|
||||
vl = VSETVL(m);
|
||||
vx = VLSEV_FLOAT(ptr - 1, stride_x, vl);
|
||||
VSSEV_FLOAT(ptr, stride_x, vx, vl);
|
||||
}
|
||||
x[0] = temp;
|
||||
}
|
||||
else if(inc_x == 1 && inc_y == 1) {
|
||||
for (size_t vl; n > 0; n -= vl, x += vl, y += vl) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
vx = VLEV_FLOAT(x, vl);
|
||||
vy = VLEV_FLOAT(y, vl);
|
||||
VSEV_FLOAT(y, vx, vl);
|
||||
VSEV_FLOAT(x, vy, vl);
|
||||
}
|
||||
|
||||
} else if (inc_y == 1) {
|
||||
stride_x = inc_x * sizeof(FLOAT);
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
vx = VLSEV_FLOAT(x, stride_x, vl);
|
||||
vy = VLEV_FLOAT(y, vl);
|
||||
VSEV_FLOAT(y, vx, vl);
|
||||
VSSEV_FLOAT(x, stride_x, vy, vl);
|
||||
}
|
||||
|
||||
} else if(inc_x == 1) {
|
||||
stride_y = inc_y * sizeof(FLOAT);
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
vx = VLEV_FLOAT(x, vl);
|
||||
vy = VLSEV_FLOAT(y, stride_y, vl);
|
||||
VSSEV_FLOAT(y, stride_y, vx, vl);
|
||||
VSEV_FLOAT(x, vy, vl);
|
||||
}
|
||||
|
||||
} else {
|
||||
stride_x = inc_x * sizeof(FLOAT);
|
||||
stride_y = inc_y * sizeof(FLOAT);
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
vx = VLSEV_FLOAT(x, stride_x, vl);
|
||||
vy = VLSEV_FLOAT(y, stride_y, vl);
|
||||
VSSEV_FLOAT(y, stride_y, vx, vl);
|
||||
VSSEV_FLOAT(x, stride_x, vy, vl);
|
||||
}
|
||||
}
|
||||
|
||||
return(0);
|
||||
}
|
|
@@ -0,0 +1,101 @@
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m2(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m2()
|
||||
#define FLOAT_V_T vfloat32m2_t
|
||||
#define VLEV_FLOAT vle32_v_f32m2
|
||||
#define VSEV_FLOAT vse32_v_f32m2
|
||||
#define VLSEV_FLOAT vlse32_v_f32m2
|
||||
#define INT_V_T vint32m2_t
|
||||
#define VID_V_INT vid_v_i32m2
|
||||
#define VADD_VX_INT vadd_vx_i32m2
|
||||
#define VMSGT_VX_INT vmsgt_vx_i32m2_b16
|
||||
#define VBOOL_T vbool16_t
|
||||
#define VMERGE_VVM_FLOAT vmerge_vvm_f32m2
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m2(n)
|
||||
#define VSETVL_MAX vsetvlmax_e64m2()
|
||||
#define FLOAT_V_T vfloat64m2_t
|
||||
#define VLEV_FLOAT vle64_v_f64m2
|
||||
#define VSEV_FLOAT vse64_v_f64m2
|
||||
#define VLSEV_FLOAT vlse64_v_f64m2
|
||||
#define INT_V_T vint64m2_t
|
||||
#define VID_V_INT vid_v_i64m2
|
||||
#define VADD_VX_INT vadd_vx_i64m2
|
||||
#define VMSGT_VX_INT vmsgt_vx_i64m2_b32
|
||||
#define VBOOL_T vbool32_t
|
||||
#define VMERGE_VVM_FLOAT vmerge_vvm_f64m2
|
||||
#endif
|
||||
|
||||
// Optimizes the implementation in ../generic/symm_lcopy_4.c
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b)
{
    BLASLONG i, js, offset;

    FLOAT *ao1, *ao2;

    BLASLONG stride_lda = sizeof(FLOAT)*lda;

    FLOAT_V_T vb, va1, va2;
    VBOOL_T vbool;
    INT_V_T vindex_max, vindex;

    size_t vl = VSETVL_MAX;
    vindex_max = VID_V_INT(vl);

    for (js = n; js > 0; js -= vl, posX += vl) {
        vl = VSETVL(js);
        offset = posX - posY;

        ao1 = a + posX + posY * lda;
        ao2 = a + posY + (posX) * lda;

        for (i = m; i > 0; i--, offset--) {
            va2 = VLSEV_FLOAT(ao2, stride_lda, vl);
            va1 = VLEV_FLOAT(ao1, vl);

            // offset > (0 - vindex) ---> (offset + vindex) > 0
            vindex = VADD_VX_INT(vindex_max, offset, vl);
            vbool = VMSGT_VX_INT(vindex, 0, vl);

            vb = VMERGE_VVM_FLOAT(vbool, va2, va1, vl);
            VSEV_FLOAT(b, vb, vl);

            b += vl;
            ao1 += lda;
            ao2++;
        }
    }

    return 0;
}
@@ -0,0 +1,100 @@
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m2(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m2()
|
||||
#define FLOAT_V_T vfloat32m2_t
|
||||
#define VLEV_FLOAT vle32_v_f32m2
|
||||
#define VSEV_FLOAT vse32_v_f32m2
|
||||
#define VLSEV_FLOAT vlse32_v_f32m2
|
||||
#define INT_V_T vint32m2_t
|
||||
#define VID_V_INT vid_v_i32m2
|
||||
#define VADD_VX_INT vadd_vx_i32m2
|
||||
#define VMSGT_VX_INT vmsgt_vx_i32m2_b16
|
||||
#define VBOOL_T vbool16_t
|
||||
#define VMERGE_VVM_FLOAT vmerge_vvm_f32m2
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m2(n)
|
||||
#define VSETVL_MAX vsetvlmax_e64m2()
|
||||
#define FLOAT_V_T vfloat64m2_t
|
||||
#define VLEV_FLOAT vle64_v_f64m2
|
||||
#define VSEV_FLOAT vse64_v_f64m2
|
||||
#define VLSEV_FLOAT vlse64_v_f64m2
|
||||
#define INT_V_T vint64m2_t
|
||||
#define VID_V_INT vid_v_i64m2
|
||||
#define VADD_VX_INT vadd_vx_i64m2
|
||||
#define VMSGT_VX_INT vmsgt_vx_i64m2_b32
|
||||
#define VBOOL_T vbool32_t
|
||||
#define VMERGE_VVM_FLOAT vmerge_vvm_f64m2
|
||||
#endif
|
||||
|
||||
// Optimizes the implementation in ../generic/symm_ucopy_4.c
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b)
|
||||
{
|
||||
BLASLONG i, js, offset;
|
||||
|
||||
FLOAT *ao1, *ao2;
|
||||
|
||||
BLASLONG stride_lda = sizeof(FLOAT)*lda;
|
||||
|
||||
FLOAT_V_T vb, va1, va2;
|
||||
VBOOL_T vbool;
|
||||
INT_V_T vindex_max, vindex;
|
||||
|
||||
size_t vl = VSETVL_MAX;
|
||||
vindex_max = VID_V_INT(vl);
|
||||
|
||||
for (js = n; js > 0; js -= vl, posX += vl) {
|
||||
vl = VSETVL(js);
|
||||
offset = posX - posY;
|
||||
|
||||
ao1 = a + posY + (posX + 0) * lda;
|
||||
ao2 = a + posX + 0 + posY * lda;
|
||||
|
||||
for (i = m; i > 0; i--, offset--) {
|
||||
va1 = VLSEV_FLOAT(ao1, stride_lda, vl);
|
||||
va2 = VLEV_FLOAT(ao2, vl);
|
||||
|
||||
// offset > (0 - vindex) ---> (offset + vindex) > 0
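// e.g. with offset = -2 and vindex = {0,1,2,3,...} the sum is {-2,-1,0,1,...},
// so the mask is set only for lanes whose column index exceeds the row index
// (strictly above the diagonal).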
|
||||
vindex = VADD_VX_INT(vindex_max, offset, vl);
|
||||
vbool = VMSGT_VX_INT(vindex, 0, vl);
|
||||
|
||||
vb = VMERGE_VVM_FLOAT(vbool, va2, va1, vl);
|
||||
VSEV_FLOAT(b, vb, vl);
|
||||
|
||||
b += vl;
|
||||
ao1++;
|
||||
ao2 += lda;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
|
@@ -0,0 +1,224 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL_MAX_M1 vsetvlmax_e32m1()
|
||||
#define VSETVL(n) vsetvl_e32m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m8()
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define VLEV_FLOAT vle32_v_f32m8
|
||||
#define VSEV_FLOAT vse32_v_f32m8
|
||||
#define VLSEV_FLOAT vlse32_v_f32m8
|
||||
#define VSSEV_FLOAT vsse32_v_f32m8
|
||||
#define VFMACCVV_FLOAT vfmacc_vv_f32m8
|
||||
#define VFMACCVF_FLOAT vfmacc_vf_f32m8
|
||||
#define VFNMSACVF_FLOAT vfnmsac_vf_f32m8
|
||||
#define VFMULVF_FLOAT vfmul_vf_f32m8
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f32m8
|
||||
#define VFMSACVF_FLOAT vfmsac_vf_f32m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
|
||||
#define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1
|
||||
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
|
||||
#else
|
||||
#define VSETVL_MAX_M1 vsetvlmax_e64m1()
|
||||
#define VSETVL(n) vsetvl_e64m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e64m8()
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define VLEV_FLOAT vle64_v_f64m8
|
||||
#define VSEV_FLOAT vse64_v_f64m8
|
||||
#define VLSEV_FLOAT vlse64_v_f64m8
|
||||
#define VSSEV_FLOAT vsse64_v_f64m8
|
||||
#define VFMACCVV_FLOAT vfmacc_vv_f64m8
|
||||
#define VFMACCVF_FLOAT vfmacc_vf_f64m8
|
||||
#define VFNMSACVF_FLOAT vfnmsac_vf_f64m8
|
||||
#define VFMULVF_FLOAT vfmul_vf_f64m8
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f64m8
|
||||
#define VFMSACVF_FLOAT vfmsac_vf_f64m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
|
||||
#define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1
|
||||
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
|
||||
#endif
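// Vectorized SYMV-style update: for each of the first `offset` columns the kernel
// scales the column by temp1 = alpha*x[j], adds it into y, and at the same time
// accumulates the dot product x[i]*a[i] needed for the symmetric contribution that
// is finally folded into y[j]. The row loop below the diagonal is processed in
// vl-sized chunks; the four branches differ only in whether x and y use unit or
// non-unit strides.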
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
|
||||
{
|
||||
BLASLONG i, j, k;
|
||||
BLASLONG ix,iy;
|
||||
BLASLONG jx,jy;
|
||||
FLOAT temp1;
|
||||
FLOAT *a_ptr = a;
|
||||
|
||||
FLOAT_V_T_M1 v_res, v_z0;
|
||||
size_t vlmax = VSETVL_MAX_M1, vl;
|
||||
v_res = VFMVVF_FLOAT_M1(0, vlmax);
|
||||
v_z0 = VFMVVF_FLOAT_M1(0, vlmax);
|
||||
vlmax = VSETVL_MAX;
|
||||
|
||||
FLOAT_V_T va, vx, vy, vr;
|
||||
BLASLONG stride_x, stride_y, inc_xv, inc_yv;
|
||||
|
||||
if(inc_x == 1 && inc_y == 1)
|
||||
{
|
||||
for (j=0; j<offset; j++)
|
||||
{
|
||||
temp1 = alpha * x[j];
|
||||
y[j] += temp1 * a_ptr[j];
|
||||
i = j + 1;
|
||||
vr = VFMVVF_FLOAT(0, vlmax);
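// vr accumulates the partial dot product x[i]*a[i] across every vl-sized chunk
// of this column; it is reduced to a scalar only after the chunk loop.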
|
||||
for (k = (m-i); k > 0; k -= vl, i += vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
|
||||
va = VLEV_FLOAT(&a_ptr[i], vl);
|
||||
vy = VLEV_FLOAT(&y[i], vl);
|
||||
vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
|
||||
VSEV_FLOAT(&y[i], vy, vl);
|
||||
|
||||
vx = VLEV_FLOAT(&x[i], vl);
|
||||
vr = VFMACCVV_FLOAT(vr, vx, va, vl);
|
||||
|
||||
}
|
||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax);
|
||||
|
||||
y[j] += alpha * VFMVFS_FLOAT_M1(v_res);
|
||||
a_ptr += lda;
|
||||
}
|
||||
}
|
||||
else if(inc_x == 1)
|
||||
{
|
||||
jy = 0;
|
||||
stride_y = inc_y * sizeof(FLOAT);
|
||||
for (j=0; j<offset; j++)
|
||||
{
|
||||
temp1 = alpha * x[j];
|
||||
y[jy] += temp1 * a_ptr[j];
|
||||
iy = jy + inc_y;
|
||||
i = j + 1;
|
||||
vr = VFMVVF_FLOAT(0, vlmax);
|
||||
for (k = (m-i); k > 0; k -= vl, i += vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
inc_yv = inc_y * vl;
|
||||
|
||||
va = VLEV_FLOAT(&a_ptr[i], vl);
|
||||
vy = VLSEV_FLOAT(&y[iy], stride_y, vl);
|
||||
vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
|
||||
VSSEV_FLOAT(&y[iy], stride_y, vy, vl);
|
||||
|
||||
vx = VLEV_FLOAT(&x[i], vl);
|
||||
vr = VFMACCVV_FLOAT(vr, vx, va, vl);
|
||||
|
||||
iy += inc_yv;
|
||||
}
|
||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax);
|
||||
|
||||
y[jy] += alpha * VFMVFS_FLOAT_M1(v_res);
|
||||
jy += inc_y;
|
||||
a_ptr += lda;
|
||||
}
|
||||
}
|
||||
else if(inc_y == 1)
|
||||
{
|
||||
jx = 0;
|
||||
stride_x = inc_x * sizeof(FLOAT);
|
||||
for (j=0; j<offset; j++)
|
||||
{
|
||||
temp1 = alpha * x[jx];
|
||||
y[j] += temp1 * a_ptr[j];
|
||||
ix = jx + inc_x;
|
||||
i = j + 1;
|
||||
vr = VFMVVF_FLOAT(0, vlmax);
|
||||
for (k = (m-i); k > 0; k -= vl, i += vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
|
||||
inc_xv = inc_x * vl;
|
||||
|
||||
va = VLEV_FLOAT(&a_ptr[i], vl);
|
||||
vy = VLEV_FLOAT(&y[i], vl);
|
||||
vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
|
||||
VSEV_FLOAT(&y[i], vy, vl);
|
||||
|
||||
vx = VLSEV_FLOAT(&x[ix], stride_x, vl);
|
||||
vr = VFMACCVV_FLOAT(vr, vx, va, vl);
|
||||
|
||||
ix += inc_xv;
|
||||
}
|
||||
|
||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax);
|
||||
|
||||
y[j] += alpha * VFMVFS_FLOAT_M1(v_res);
|
||||
jx += inc_x;
|
||||
a_ptr += lda;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
stride_x = inc_x * sizeof(FLOAT);
|
||||
stride_y = inc_y * sizeof(FLOAT);
|
||||
jx = 0;
|
||||
jy = 0;
|
||||
for (j=0; j<offset; j++)
|
||||
{
|
||||
temp1 = alpha * x[jx];
|
||||
y[jy] += temp1 * a_ptr[j];
|
||||
ix = jx + inc_x;
|
||||
iy = jy + inc_y;
|
||||
i = j + 1;
|
||||
vr = VFMVVF_FLOAT(0, vlmax);
|
||||
for (k = (m-i); k > 0; k -= vl, i += vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
inc_xv = inc_x * vl;
|
||||
inc_yv = inc_y * vl;
|
||||
|
||||
|
||||
va = VLEV_FLOAT(&a_ptr[i], vl);
|
||||
vy = VLSEV_FLOAT(&y[iy], stride_y, vl);
|
||||
vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
|
||||
VSSEV_FLOAT(&y[iy], stride_y, vy, vl);
|
||||
|
||||
vx = VLSEV_FLOAT(&x[ix], stride_x, vl);
|
||||
vr = VFMACCVV_FLOAT(vr, vx, va, vl);
|
||||
|
||||
ix += inc_xv;
|
||||
iy += inc_yv;
|
||||
}
|
||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax);
|
||||
|
||||
y[jy] += alpha * VFMVFS_FLOAT_M1(v_res);
|
||||
jx += inc_x;
|
||||
jy += inc_y;
|
||||
a_ptr += lda;
|
||||
}
|
||||
}
|
||||
return(0);
|
||||
}
|
||||
|
|
@@ -0,0 +1,221 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL_MAX_M1 vsetvlmax_e32m1()
|
||||
#define VSETVL(n) vsetvl_e32m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m8()
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define VLEV_FLOAT vle32_v_f32m8
|
||||
#define VSEV_FLOAT vse32_v_f32m8
|
||||
#define VLSEV_FLOAT vlse32_v_f32m8
|
||||
#define VSSEV_FLOAT vsse32_v_f32m8
|
||||
#define VFMACCVV_FLOAT vfmacc_vv_f32m8
|
||||
#define VFMACCVF_FLOAT vfmacc_vf_f32m8
|
||||
#define VFNMSACVF_FLOAT vfnmsac_vf_f32m8
|
||||
#define VFMULVF_FLOAT vfmul_vf_f32m8
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f32m8
|
||||
#define VFMSACVF_FLOAT vfmsac_vf_f32m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
|
||||
#define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1
|
||||
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
|
||||
#else
|
||||
#define VSETVL_MAX_M1 vsetvlmax_e64m1()
|
||||
#define VSETVL(n) vsetvl_e64m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e64m8()
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define VLEV_FLOAT vle64_v_f64m8
|
||||
#define VSEV_FLOAT vse64_v_f64m8
|
||||
#define VLSEV_FLOAT vlse64_v_f64m8
|
||||
#define VSSEV_FLOAT vsse64_v_f64m8
|
||||
#define VFMACCVV_FLOAT vfmacc_vv_f64m8
|
||||
#define VFMACCVF_FLOAT vfmacc_vf_f64m8
|
||||
#define VFNMSACVF_FLOAT vfnmsac_vf_f64m8
|
||||
#define VFMULVF_FLOAT vfmul_vf_f64m8
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f64m8
|
||||
#define VFMSACVF_FLOAT vfmsac_vf_f64m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
|
||||
#define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1
|
||||
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
|
||||
#endif
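// Companion to the kernel above: here the handled columns are the last `offset`
// ones (j = m-offset .. m-1) and the row loop covers the rows before the diagonal
// (i = 0 .. j-1), with the same unit/non-unit stride specializations for x and y.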
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
|
||||
{
|
||||
BLASLONG i, j, k;
|
||||
BLASLONG ix,iy;
|
||||
BLASLONG jx,jy;
|
||||
FLOAT temp1;
|
||||
FLOAT *a_ptr = a;
|
||||
FLOAT_V_T_M1 v_res, v_z0;
|
||||
size_t vl_max = VSETVL_MAX_M1, vl;
|
||||
v_res = VFMVVF_FLOAT_M1(0, vl_max);
|
||||
v_z0 = VFMVVF_FLOAT_M1(0, vl_max);
|
||||
vl_max = VSETVL_MAX;
|
||||
|
||||
FLOAT_V_T va, vx, vy, vr;
|
||||
BLASLONG stride_x, stride_y, inc_xv, inc_yv;
|
||||
|
||||
BLASLONG m1 = m - offset;
|
||||
if(inc_x == 1 && inc_y == 1)
|
||||
{
|
||||
a_ptr += m1 * lda;
|
||||
for (j=m1; j<m; j++)
|
||||
{
|
||||
temp1 = alpha * x[j];
|
||||
i = 0;
|
||||
vr = VFMVVF_FLOAT(0, vl_max);
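// As above, vr carries the running dot product over all chunks of the column
// and is reduced once after the chunk loop.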
|
||||
for (k = j; k > 0; k -= vl, i += vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
|
||||
vy = VLEV_FLOAT(&y[i], vl);
|
||||
va = VLEV_FLOAT(&a_ptr[i], vl);
|
||||
vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
|
||||
VSEV_FLOAT(&y[i], vy, vl);
|
||||
|
||||
vx = VLEV_FLOAT(&x[i], vl);
|
||||
vr = VFMACCVV_FLOAT(vr, vx, va, vl);
|
||||
}
|
||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max);
|
||||
|
||||
y[j] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res);
|
||||
a_ptr += lda;
|
||||
}
|
||||
}
|
||||
else if(inc_x == 1)
|
||||
{
|
||||
jy = m1 * inc_y;
|
||||
a_ptr += m1 * lda;
|
||||
stride_y = inc_y * sizeof(FLOAT);
|
||||
for (j=m1; j<m; j++)
|
||||
{
|
||||
temp1 = alpha * x[j];
|
||||
iy = 0;
|
||||
i = 0;
|
||||
vr = VFMVVF_FLOAT(0, vl_max);
|
||||
for (k = j; k > 0; k -= vl, i += vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
inc_yv = inc_y * vl;
|
||||
|
||||
vy = VLSEV_FLOAT(&y[iy], stride_y, vl);
|
||||
va = VLEV_FLOAT(&a_ptr[i], vl);
|
||||
vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
|
||||
VSSEV_FLOAT(&y[iy], stride_y, vy, vl);
|
||||
|
||||
vx = VLEV_FLOAT(&x[i], vl);
|
||||
vr = VFMACCVV_FLOAT(vr, vx, va, vl);
|
||||
|
||||
iy += inc_yv;
|
||||
}
|
||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max);
|
||||
|
||||
y[jy] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res);
|
||||
a_ptr += lda;
|
||||
jy += inc_y;
|
||||
}
|
||||
}
|
||||
else if(inc_y == 1)
|
||||
{
|
||||
jx = m1 * inc_x;
|
||||
a_ptr += m1 * lda;
|
||||
stride_x = inc_x * sizeof(FLOAT);
|
||||
for (j=m1; j<m; j++)
|
||||
{
|
||||
temp1 = alpha * x[jx];
|
||||
ix = 0;
|
||||
i = 0;
|
||||
vr = VFMVVF_FLOAT(0, vl_max);
|
||||
for (k = j; k > 0; k -= vl, i += vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
inc_xv = inc_x * vl;
|
||||
|
||||
|
||||
vy = VLEV_FLOAT(&y[i], vl);
|
||||
va = VLEV_FLOAT(&a_ptr[i], vl);
|
||||
vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
|
||||
VSEV_FLOAT(&y[i], vy, vl);
|
||||
|
||||
vx = VLSEV_FLOAT(&x[ix], stride_x, vl);
|
||||
vr = VFMACCVV_FLOAT(vr, vx, va, vl);
|
||||
|
||||
ix += inc_xv;
|
||||
}
|
||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max);
|
||||
|
||||
y[j] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res);
|
||||
a_ptr += lda;
|
||||
jx += inc_x;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
jx = m1 * inc_x;
|
||||
jy = m1 * inc_y;
|
||||
a_ptr += m1 * lda;
|
||||
stride_x = inc_x * sizeof(FLOAT);
|
||||
stride_y = inc_y * sizeof(FLOAT);
|
||||
for (j=m1; j<m; j++)
|
||||
{
|
||||
temp1 = alpha * x[jx];
|
||||
|
||||
ix = 0;
|
||||
iy = 0;
|
||||
i = 0;
|
||||
vr = VFMVVF_FLOAT(0, vl_max);
|
||||
for (k = j; k > 0; k -= vl, i += vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
inc_xv = inc_x * vl;
|
||||
inc_yv = inc_y * vl;
|
||||
|
||||
vy = VLSEV_FLOAT(&y[iy], stride_y, vl);
|
||||
va = VLEV_FLOAT(&a_ptr[i], vl);
|
||||
vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
|
||||
VSSEV_FLOAT(&y[iy], stride_y, vy, vl);
|
||||
|
||||
vx = VLSEV_FLOAT(&x[ix], stride_x, vl);
|
||||
vr = VFMACCVV_FLOAT(vr, vx, va, vl);
|
||||
ix += inc_xv;
|
||||
iy += inc_yv;
|
||||
}
|
||||
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max);
|
||||
|
||||
y[jy] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res);
|
||||
a_ptr += lda;
|
||||
jx += inc_x;
|
||||
jy += inc_y;
|
||||
}
|
||||
}
|
||||
return(0);
|
||||
}
|
|
@@ -0,0 +1,138 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m2(n)
|
||||
#define FLOAT_V_T vfloat32m2_t
|
||||
#define VLEV_FLOAT vle32_v_f32m2
|
||||
#define VSEV_FLOAT vse32_v_f32m2
|
||||
#define VLSEV_FLOAT vlse32_v_f32m2
|
||||
#define VBOOL_T vbool16_t
|
||||
#define UINT_V_T vuint32m2_t
#define VID_V_UINT vid_v_u32m2
#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16
|
||||
#define VMSEQ_VX_UINT vmseq_vx_i32m2_b16
|
||||
#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m2(n)
|
||||
#define FLOAT_V_T vfloat64m2_t
|
||||
#define VLEV_FLOAT vle64_v_f64m2
|
||||
#define VSEV_FLOAT vse64_v_f64m2
|
||||
#define VLSEV_FLOAT vlse64_v_f64m2
|
||||
#define VBOOL_T vbool32_t
|
||||
#define UINT_V_T vuint64m2_t
|
||||
#define VID_V_UINT vid_v_u64m2
|
||||
#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32
|
||||
#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32
|
||||
#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2
|
||||
#endif
|
||||
|
||||
// Optimizes the implementation in ../arm64/trmm_lncopy_sve_v1.c
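// Blocks that lie entirely inside the stored triangle are copied as-is (strided
// load, contiguous store); blocks entirely outside it are skipped, leaving the
// corresponding part of the packed buffer untouched. On the diagonal block each
// packed column is built per lane: entries on the wrong side of the diagonal are
// replaced by ZERO via the vid/compare mask, and with UNIT defined the diagonal
// entry itself is overwritten with ONE.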
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
|
||||
|
||||
BLASLONG i, js, X;
|
||||
|
||||
FLOAT *ao;
|
||||
|
||||
BLASLONG stride_lda = sizeof(FLOAT)*lda;
|
||||
|
||||
FLOAT_V_T vb, va1;
|
||||
|
||||
size_t vl;
|
||||
#ifdef UNIT
|
||||
VBOOL_T vbool_eq;
|
||||
#endif
|
||||
|
||||
VBOOL_T vbool_cmp;
|
||||
UINT_V_T vindex;
|
||||
|
||||
for (js = n; js > 0; js -= vl)
|
||||
{
|
||||
vl = VSETVL(js);
|
||||
X = posX;
|
||||
|
||||
if (posX <= posY)
|
||||
{
|
||||
ao = a + posY + posX * lda;
|
||||
}
|
||||
else
|
||||
{
|
||||
ao = a + posX + posY * lda;
|
||||
}
|
||||
|
||||
i = 0;
|
||||
do
|
||||
{
|
||||
if (X > posY)
|
||||
{
|
||||
va1 = VLSEV_FLOAT(ao, stride_lda, vl);
|
||||
VSEV_FLOAT(b, va1, vl);
|
||||
|
||||
ao ++;
|
||||
b += vl;
|
||||
X ++;
|
||||
i ++;
|
||||
}
|
||||
else if (X < posY)
|
||||
{
|
||||
ao += lda;
|
||||
b += vl;
|
||||
X ++;
|
||||
i ++;
|
||||
}
|
||||
else
|
||||
{
|
||||
vindex = VID_V_UINT(vl);
|
||||
for (unsigned int j = 0; j < vl; j++)
|
||||
{
|
||||
va1 = VLSEV_FLOAT(ao, stride_lda, vl);
|
||||
vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl);
|
||||
vb = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl);
|
||||
#ifdef UNIT
|
||||
vbool_eq = VMSEQ_VX_UINT(vindex, j, vl);
|
||||
vb = VFMERGE_VFM_FLOAT(vbool_eq, vb, ONE, vl);
|
||||
#endif
|
||||
VSEV_FLOAT(b, vb, vl);
|
||||
ao++;
|
||||
b += vl;
|
||||
}
|
||||
|
||||
X += vl;
|
||||
i += vl;
|
||||
}
|
||||
} while (i < m);
|
||||
|
||||
posY += vl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
|
@@ -0,0 +1,134 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m2(n)
|
||||
#define FLOAT_V_T vfloat32m2_t
|
||||
#define VLEV_FLOAT vle32_v_f32m2
|
||||
#define VSEV_FLOAT vse32_v_f32m2
|
||||
#define VBOOL_T vbool16_t
|
||||
#define UINT_V_T vuint32m2_t
|
||||
#define VID_V_UINT vid_v_u32m2
|
||||
#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16
|
||||
#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16
|
||||
#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m2(n)
|
||||
#define FLOAT_V_T vfloat64m2_t
|
||||
#define VLEV_FLOAT vle64_v_f64m2
|
||||
#define VSEV_FLOAT vse64_v_f64m2
|
||||
#define VBOOL_T vbool32_t
|
||||
#define UINT_V_T vuint64m2_t
|
||||
#define VID_V_UINT vid_v_u64m2
|
||||
#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32
|
||||
#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32
|
||||
#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2
|
||||
#endif
|
||||
|
||||
// Optimizes the implementation in ../arm64/trmm_ltcopy_sve_v1.c
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
|
||||
|
||||
BLASLONG i, js, X;
|
||||
|
||||
FLOAT *ao;
|
||||
|
||||
FLOAT_V_T vb, va1;
|
||||
size_t vl;
|
||||
#ifdef UNIT
|
||||
VBOOL_T vbool_eq;
|
||||
#endif
|
||||
|
||||
VBOOL_T vbool_cmp;
|
||||
UINT_V_T vindex;
|
||||
|
||||
for (js = n; js > 0; js -= vl)
|
||||
{
|
||||
vl = VSETVL(js);
|
||||
X = posX;
|
||||
|
||||
if (posX <= posY)
|
||||
{
|
||||
ao = a + posY + posX * lda;
|
||||
}
|
||||
else
|
||||
{
|
||||
ao = a + posX + posY * lda;
|
||||
}
|
||||
|
||||
i = 0;
|
||||
do
|
||||
{
|
||||
if (X > posY)
|
||||
{
|
||||
ao ++;
|
||||
b += vl;
|
||||
X ++;
|
||||
i ++;
|
||||
}
|
||||
else if (X < posY)
|
||||
{
|
||||
va1 = VLEV_FLOAT(ao, vl);
|
||||
VSEV_FLOAT(b, va1, vl);
|
||||
|
||||
ao += lda;
|
||||
b += vl;
|
||||
X ++;
|
||||
i ++;
|
||||
}
|
||||
else
|
||||
{
|
||||
vindex = VID_V_UINT(vl);
|
||||
for (unsigned int j = 0; j < vl; j++)
|
||||
{
|
||||
va1 = VLEV_FLOAT(ao, vl);
|
||||
vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl);
|
||||
vb = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl);
|
||||
#ifdef UNIT
|
||||
vbool_eq = VMSEQ_VX_UINT(vindex, j, vl);
|
||||
vb = VFMERGE_VFM_FLOAT(vbool_eq, vb, ONE, vl);
|
||||
#endif
|
||||
VSEV_FLOAT(b, vb, vl);
|
||||
ao += lda;
|
||||
b += vl;
|
||||
}
|
||||
X += vl;
|
||||
i += vl;
|
||||
|
||||
}
|
||||
} while (i < m);
|
||||
|
||||
posY += vl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
@@ -0,0 +1,136 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m2(n)
|
||||
#define FLOAT_V_T vfloat32m2_t
|
||||
#define VLEV_FLOAT vle32_v_f32m2
|
||||
#define VLSEV_FLOAT vlse32_v_f32m2
|
||||
#define VSEV_FLOAT vse32_v_f32m2
|
||||
#define VBOOL_T vbool16_t
|
||||
#define UINT_V_T vuint32m2_t
|
||||
#define VID_V_UINT vid_v_u32m2
|
||||
#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16
|
||||
#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16
|
||||
#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m2(n)
|
||||
#define FLOAT_V_T vfloat64m2_t
|
||||
#define VLEV_FLOAT vle64_v_f64m2
|
||||
#define VLSEV_FLOAT vlse64_v_f64m2
|
||||
#define VSEV_FLOAT vse64_v_f64m2
|
||||
#define VBOOL_T vbool32_t
|
||||
#define UINT_V_T vuint64m2_t
|
||||
#define VID_V_UINT vid_v_u64m2
|
||||
#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32
|
||||
#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32
|
||||
#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2
|
||||
#endif
|
||||
|
||||
// Optimizes the implementation in ../arm64/trmm_uncopy_sve_v1.c
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
|
||||
|
||||
BLASLONG i, js, X;
|
||||
BLASLONG stride_lda = sizeof(FLOAT) * lda;
|
||||
FLOAT *ao;
|
||||
|
||||
FLOAT_V_T vb, va1;
|
||||
size_t vl;
|
||||
|
||||
#ifdef UNIT
|
||||
VBOOL_T vbool_eq;
|
||||
#endif
|
||||
|
||||
VBOOL_T vbool_cmp;
|
||||
UINT_V_T vindex;
|
||||
|
||||
for (js = n; js > 0; js -= vl)
|
||||
{
|
||||
vl = VSETVL(js);
|
||||
X = posX;
|
||||
|
||||
if (posX <= posY)
|
||||
{
|
||||
ao = a + posX + posY * lda;
|
||||
}
|
||||
else
|
||||
{
|
||||
ao = a + posY + posX * lda;
|
||||
}
|
||||
|
||||
i = 0;
|
||||
do
|
||||
{
|
||||
if (X < posY)
|
||||
{
|
||||
va1 = VLSEV_FLOAT(ao, stride_lda, vl);
|
||||
VSEV_FLOAT(b, va1, vl);
|
||||
|
||||
ao ++;
|
||||
b += vl;
|
||||
X ++;
|
||||
i ++;
|
||||
}
|
||||
else if (X > posY)
|
||||
{
|
||||
ao += lda;
|
||||
b += vl;
|
||||
X ++;
|
||||
i ++;
|
||||
}
|
||||
else
|
||||
{
|
||||
vindex = VID_V_UINT(vl);
|
||||
for (unsigned int j = 0; j < vl; j++)
|
||||
{
|
||||
va1 = VLSEV_FLOAT(ao, stride_lda, vl);
|
||||
vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl);
|
||||
vb = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl);
|
||||
#ifdef UNIT
|
||||
vbool_eq = VMSEQ_VX_UINT(vindex, j, vl);
|
||||
vb = VFMERGE_VFM_FLOAT(vbool_eq, vb, ONE, vl);
|
||||
#endif
|
||||
VSEV_FLOAT(b, vb, vl);
|
||||
ao++;
|
||||
b += vl;
|
||||
}
|
||||
|
||||
X += vl;
|
||||
i += vl;
|
||||
}
|
||||
}while (i < m);
|
||||
|
||||
posY += vl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
|
@@ -0,0 +1,133 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m2(n)
|
||||
#define FLOAT_V_T vfloat32m2_t
|
||||
#define VLEV_FLOAT vle32_v_f32m2
|
||||
#define VSEV_FLOAT vse32_v_f32m2
|
||||
#define VBOOL_T vbool16_t
|
||||
#define UINT_V_T vuint32m2_t
|
||||
#define VID_V_UINT vid_v_u32m2
|
||||
#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16
|
||||
#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16
|
||||
#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m2(n)
|
||||
#define FLOAT_V_T vfloat64m2_t
|
||||
#define VLEV_FLOAT vle64_v_f64m2
|
||||
#define VSEV_FLOAT vse64_v_f64m2
|
||||
#define VBOOL_T vbool32_t
|
||||
#define UINT_V_T vuint64m2_t
|
||||
#define VID_V_UINT vid_v_u64m2
|
||||
#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32
|
||||
#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32
|
||||
#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2
|
||||
#endif
|
||||
|
||||
// Optimizes the implementation in ../arm64/trmm_utcopy_sve_v1.c
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
|
||||
|
||||
BLASLONG i, j, js, X;
|
||||
|
||||
FLOAT *ao;
|
||||
FLOAT_V_T vb, va1;
|
||||
#ifdef UNIT
|
||||
VBOOL_T vbool_eq;
|
||||
#endif
|
||||
|
||||
VBOOL_T vbool_cmp;
|
||||
UINT_V_T vindex;
|
||||
|
||||
size_t vl;
|
||||
|
||||
for (js = n; js > 0; js -= vl)
|
||||
{
|
||||
vl = VSETVL(js);
|
||||
|
||||
X = posX;
|
||||
|
||||
if (posX <= posY)
|
||||
{
|
||||
ao = a + posX + posY * lda;
|
||||
}
|
||||
else
|
||||
{
|
||||
ao = a + posY + posX * lda;
|
||||
}
|
||||
|
||||
i = 0;
|
||||
do
|
||||
{
|
||||
if (X < posY)
|
||||
{
|
||||
ao ++;
|
||||
b += vl;
|
||||
X ++;
|
||||
i++;
|
||||
}
|
||||
else if (X > posY)
|
||||
{
|
||||
va1 = VLEV_FLOAT(ao, vl);
|
||||
VSEV_FLOAT(b, va1, vl);
|
||||
ao += lda;
|
||||
b += vl;
|
||||
X++;
|
||||
i++;
|
||||
}
|
||||
else
|
||||
{
|
||||
vindex = VID_V_UINT(vl);
|
||||
for (j = 0; j < vl; j++)
|
||||
{
|
||||
va1 = VLEV_FLOAT(ao, vl);
|
||||
vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl);
|
||||
vb = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl);
|
||||
#ifdef UNIT
|
||||
vbool_eq = VMSEQ_VX_UINT(vindex, j, vl);
|
||||
vb = VFMERGE_VFM_FLOAT(vbool_eq, vb, ONE, vl);
|
||||
#endif
|
||||
VSEV_FLOAT(b, vb, vl);
|
||||
ao += lda;
|
||||
b += vl;
|
||||
}
|
||||
X += vl;
|
||||
i += vl;
|
||||
}
|
||||
}while (i < m);
|
||||
posY += vl;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
|
@@ -0,0 +1,685 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m2(n)
|
||||
#define FLOAT_V_T vfloat32m2_t
|
||||
#define VLEV_FLOAT vle32_v_f32m2
|
||||
#define VSEV_FLOAT vse32_v_f32m2
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f32m2
|
||||
#define VFMACCVF_FLOAT vfmacc_vf_f32m2
|
||||
#define VFMULVF_FLOAT vfmul_vf_f32m2
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m2(n)
|
||||
#define FLOAT_V_T vfloat64m2_t
|
||||
#define VLEV_FLOAT vle64_v_f64m2
|
||||
#define VSEV_FLOAT vse64_v_f64m2
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f64m2
|
||||
#define VFMACCVF_FLOAT vfmacc_vf_f64m2
|
||||
#define VFMULVF_FLOAT vfmul_vf_f64m2
|
||||
#endif
|
||||
|
||||
|
||||
// Optimizes the implementation in ../generic/trmmkernel_8x8.c
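// The M dimension is handled vl elements at a time (one VSETVL strip per outer
// iteration), N is unrolled by 8/4/2/1 and K by 8 with a scalar remainder loop;
// the off/temp bookkeeping follows the usual TRMM kernel convention selected by
// LEFT/TRANSA.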
|
||||
|
||||
|
||||
int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset)
|
||||
{
|
||||
//fprintf(stderr, "%s, %s, bm=%4ld bn=%4ld bk=%4ld alpha=%f ldc=%ld\n", __FILE__, __FUNCTION__, bm, bn, bk, alpha, ldc);
|
||||
|
||||
BLASLONG i,j,k;
|
||||
FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7,*ptrba,*ptrbb;
|
||||
|
||||
FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7;
|
||||
FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7;
|
||||
size_t vl;
|
||||
|
||||
BLASLONG off, temp;
|
||||
|
||||
#if !defined(LEFT)
|
||||
off = -offset;
|
||||
#else
|
||||
off = 0;
|
||||
#endif
|
||||
for (j = bn/8; j > 0; j--)
|
||||
{
|
||||
C0 = C;
|
||||
C1 = C0+ldc;
|
||||
C2 = C1+ldc;
|
||||
C3 = C2+ldc;
|
||||
C4 = C3+ldc;
|
||||
C5 = C4+ldc;
|
||||
C6 = C5+ldc;
|
||||
C7 = C6+ldc;
|
||||
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
off = offset;
|
||||
#endif
|
||||
|
||||
ptrba = ba;
|
||||
|
||||
for (i = bm; i > 0; i -= vl)
|
||||
{
|
||||
vl = VSETVL(i);
|
||||
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
ptrbb = bb;
|
||||
#else
|
||||
ptrba += off*vl;
|
||||
ptrbb = bb + off*8;
|
||||
#endif
|
||||
|
||||
vres0 = VFMVVF_FLOAT(0.0, vl);
|
||||
vres1 = VFMVVF_FLOAT(0.0, vl);
|
||||
vres2 = VFMVVF_FLOAT(0.0, vl);
|
||||
vres3 = VFMVVF_FLOAT(0.0, vl);
|
||||
vres4 = VFMVVF_FLOAT(0.0, vl);
|
||||
vres5 = VFMVVF_FLOAT(0.0, vl);
|
||||
vres6 = VFMVVF_FLOAT(0.0, vl);
|
||||
vres7 = VFMVVF_FLOAT(0.0, vl);
|
||||
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
||||
temp = bk-off;
|
||||
#elif defined(LEFT)
|
||||
temp = off+vl; // number of values in A
|
||||
#else
|
||||
temp = off+8; // number of values in B
|
||||
#endif
|
||||
|
||||
for (k = temp/8; k > 0; k--) {
|
||||
va0 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
va1 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl);
|
||||
vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl);
|
||||
vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl);
|
||||
vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl);
|
||||
vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl);
|
||||
ptrbb += 8;
|
||||
va2 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl);
|
||||
vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va1, vl);
|
||||
vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va1, vl);
|
||||
vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va1, vl);
|
||||
vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va1, vl);
|
||||
ptrbb += 8;
|
||||
va3 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl);
|
||||
vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va2, vl);
|
||||
vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va2, vl);
|
||||
vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va2, vl);
|
||||
vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va2, vl);
|
||||
ptrbb += 8;
|
||||
va4 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl);
|
||||
vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va3, vl);
|
||||
vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va3, vl);
|
||||
vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va3, vl);
|
||||
vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va3, vl);
|
||||
ptrbb += 8;
|
||||
va5 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl);
|
||||
vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va4, vl);
|
||||
vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va4, vl);
|
||||
vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va4, vl);
|
||||
vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va4, vl);
|
||||
ptrbb += 8;
|
||||
va6 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl);
|
||||
vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va5, vl);
|
||||
vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va5, vl);
|
||||
vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va5, vl);
|
||||
vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va5, vl);
|
||||
ptrbb += 8;
|
||||
va7 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl);
|
||||
vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va6, vl);
|
||||
vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va6, vl);
|
||||
vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va6, vl);
|
||||
vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va6, vl);
|
||||
ptrbb += 8;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl);
|
||||
vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va7, vl);
|
||||
vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va7, vl);
|
||||
vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va7, vl);
|
||||
vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va7, vl);
|
||||
ptrbb += 8;
|
||||
}
|
||||
|
||||
for (k = temp&7; k > 0; k--) {
|
||||
va0 = VLEV_FLOAT(ptrba, vl); // load one vl-wide slice of the M dimension
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl);
|
||||
vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl);
|
||||
vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl);
|
||||
vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl);
|
||||
vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl);
|
||||
|
||||
ptrbb += 8;
|
||||
ptrba += vl;
|
||||
}
|
||||
|
||||
va0 = VFMULVF_FLOAT(vres0, alpha, vl);
|
||||
VSEV_FLOAT(C0, va0, vl);
|
||||
|
||||
va1 = VFMULVF_FLOAT(vres1, alpha, vl);
|
||||
VSEV_FLOAT(C1, va1, vl);
|
||||
|
||||
va2 = VFMULVF_FLOAT(vres2, alpha, vl);
|
||||
VSEV_FLOAT(C2, va2, vl);
|
||||
|
||||
va3 = VFMULVF_FLOAT(vres3, alpha, vl);
|
||||
VSEV_FLOAT(C3, va3, vl);
|
||||
|
||||
va4 = VFMULVF_FLOAT(vres4, alpha, vl);
|
||||
VSEV_FLOAT(C4, va4, vl);
|
||||
|
||||
va5 = VFMULVF_FLOAT(vres5, alpha, vl);
|
||||
VSEV_FLOAT(C5, va5, vl);
|
||||
|
||||
va6 = VFMULVF_FLOAT(vres6, alpha, vl);
|
||||
VSEV_FLOAT(C6, va6, vl);
|
||||
|
||||
va7 = VFMULVF_FLOAT(vres7, alpha, vl);
|
||||
VSEV_FLOAT(C7, va7, vl);
|
||||
|
||||
|
||||
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
temp = bk - off;
|
||||
#ifdef LEFT
|
||||
temp -= vl; // number of values in A
|
||||
#else
|
||||
temp -= 8; // number of values in B
|
||||
#endif
|
||||
ptrba += temp*vl;
|
||||
ptrbb += temp*8;
|
||||
#endif
|
||||
|
||||
#ifdef LEFT
|
||||
off += vl; // number of values in A
|
||||
#endif
|
||||
|
||||
C0 += vl;
|
||||
C1 += vl;
|
||||
C2 += vl;
|
||||
C3 += vl;
|
||||
C4 += vl;
|
||||
C5 += vl;
|
||||
C6 += vl;
|
||||
C7 += vl;
|
||||
}
|
||||
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
off += 8;
|
||||
#endif
|
||||
|
||||
bb += (bk<<3);
|
||||
C += (ldc<<3);
|
||||
}
|
||||
|
||||
if (bn & 4)
|
||||
{
|
||||
C0 = C;
|
||||
C1 = C0+ldc;
|
||||
C2 = C1+ldc;
|
||||
C3 = C2+ldc;
|
||||
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
off = offset;
|
||||
#endif
|
||||
ptrba = ba;
|
||||
|
||||
for (i = bm; i > 0; i -= vl)
|
||||
{
|
||||
vl = VSETVL(i);
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
ptrbb = bb;
|
||||
#else
|
||||
ptrba += off*vl;
|
||||
ptrbb = bb + off*4;
|
||||
#endif
|
||||
|
||||
vres0 = VFMVVF_FLOAT(0.0, vl);
|
||||
vres1 = VFMVVF_FLOAT(0.0, vl);
|
||||
vres2 = VFMVVF_FLOAT(0.0, vl);
|
||||
vres3 = VFMVVF_FLOAT(0.0, vl);
|
||||
|
||||
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
||||
temp = bk-off;
|
||||
#elif defined(LEFT)
|
||||
temp = off+vl; // number of values in A
|
||||
#else
|
||||
temp = off+4; // number of values in B
|
||||
#endif
|
||||
|
||||
for (k = temp/8; k > 0; k--) {
|
||||
va0 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
va1 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl);
|
||||
ptrbb += 4;
|
||||
va2 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl);
|
||||
ptrbb += 4;
|
||||
va3 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl);
|
||||
ptrbb += 4;
|
||||
va4 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl);
|
||||
ptrbb += 4;
|
||||
va5 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl);
|
||||
ptrbb += 4;
|
||||
va6 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl);
|
||||
ptrbb += 4;
|
||||
va7 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl);
|
||||
ptrbb += 4;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl);
|
||||
ptrbb += 4;
|
||||
}
|
||||
|
||||
// K remainder
|
||||
for (k = temp&7; k > 0; k--) {
|
||||
va0 = VLEV_FLOAT(ptrba, vl);
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
|
||||
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl);
|
||||
vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl);
|
||||
|
||||
ptrbb += 4;
|
||||
ptrba += vl;
|
||||
}
|
||||
|
||||
va0 = VFMULVF_FLOAT(vres0, alpha, vl);
|
||||
VSEV_FLOAT(C0, va0, vl);
|
||||
|
||||
va1 = VFMULVF_FLOAT(vres1, alpha, vl);
|
||||
VSEV_FLOAT(C1, va1, vl);
|
||||
|
||||
va2 = VFMULVF_FLOAT(vres2, alpha, vl);
|
||||
VSEV_FLOAT(C2, va2, vl);
|
||||
|
||||
va3 = VFMULVF_FLOAT(vres3, alpha, vl);
|
||||
VSEV_FLOAT(C3, va3, vl);
|
||||
|
||||
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
temp = bk - off;
|
||||
#ifdef LEFT
|
||||
temp -= vl; // number of values in A
|
||||
#else
|
||||
temp -= 4; // number of values in B
|
||||
#endif
|
||||
ptrba += temp*vl;
|
||||
ptrbb += temp*4;
|
||||
#endif
|
||||
|
||||
#ifdef LEFT
|
||||
off += vl; // number of values in A
|
||||
#endif
|
||||
|
||||
C0 += vl;
|
||||
C1 += vl;
|
||||
C2 += vl;
|
||||
C3 += vl;
|
||||
}
|
||||
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
off += 4;
|
||||
#endif
|
||||
|
||||
bb += (bk<<2);
|
||||
C += (ldc<<2);
|
||||
}
|
||||
|
||||
if (bn & 2)
|
||||
{
|
||||
C0 = C;
|
||||
C1 = C0+ldc;
|
||||
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
off = offset;
|
||||
#endif
|
||||
|
||||
ptrba = ba;
|
||||
|
||||
for (i = bm; i > 0; i -= vl)
|
||||
{
|
||||
vl = VSETVL(i);
|
||||
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
ptrbb = bb;
|
||||
#else
|
||||
ptrba += off*vl;
|
||||
ptrbb = bb + off*2;
|
||||
#endif
|
||||
|
||||
vres0 = VFMVVF_FLOAT(0.0, vl);
|
||||
vres1 = VFMVVF_FLOAT(0.0, vl);
|
||||
|
||||
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
||||
temp = bk-off;
|
||||
#elif defined(LEFT)
|
||||
temp = off+vl; // number of values in A
|
||||
#else
|
||||
temp = off+2; // number of values in B
|
||||
#endif
|
||||
|
||||
for (k = temp/8; k > 0; k--) {
|
||||
va0 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
va1 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
|
||||
ptrbb += 2;
|
||||
va2 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl);
|
||||
ptrbb += 2;
|
||||
va3 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl);
|
||||
ptrbb += 2;
|
||||
va4 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl);
|
||||
ptrbb += 2;
|
||||
va5 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl);
|
||||
ptrbb += 2;
|
||||
va6 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl);
|
||||
ptrbb += 2;
|
||||
va7 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl);
|
||||
ptrbb += 2;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl);
|
||||
ptrbb += 2;
|
||||
}
|
||||
|
||||
// K remainder
|
||||
for (k = temp&7; k > 0; k--) {
|
||||
va0 = VLEV_FLOAT(ptrba, vl);
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
|
||||
vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
|
||||
|
||||
ptrbb += 2;
|
||||
ptrba += vl;
|
||||
}
|
||||
va0 = VFMULVF_FLOAT(vres0, alpha, vl);
|
||||
VSEV_FLOAT(C0, va0, vl);
|
||||
|
||||
va1 = VFMULVF_FLOAT(vres1, alpha, vl);
|
||||
VSEV_FLOAT(C1, va1, vl);
|
||||
|
||||
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
temp = bk - off;
|
||||
#ifdef LEFT
|
||||
temp -= vl; // number of values in A
|
||||
#else
|
||||
temp -= 2; // number of values in B
|
||||
#endif
|
||||
ptrba += temp*vl;
|
||||
ptrbb += temp*2;
|
||||
#endif
|
||||
|
||||
#ifdef LEFT
|
||||
off += vl; // number of values in A
|
||||
#endif
|
||||
|
||||
C0 += vl;
|
||||
C1 += vl;
|
||||
}
|
||||
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
off += 2;
|
||||
#endif
|
||||
|
||||
bb += (bk<<1);
|
||||
C += (ldc<<1);
|
||||
}
|
||||
|
||||
if (bn & 1)
|
||||
{
|
||||
C0 = C;
|
||||
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
off = offset;
|
||||
#endif
|
||||
|
||||
ptrba = ba;
|
||||
|
||||
for (i = bm; i > 0; i -= vl)
|
||||
{
|
||||
vl = VSETVL(i);
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
ptrbb = bb;
|
||||
#else
|
||||
ptrba += off*vl;
|
||||
ptrbb = bb + off*1;
|
||||
#endif
|
||||
|
||||
vres0 = VFMVVF_FLOAT(0.0, vl);
|
||||
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
||||
temp = bk-off;
|
||||
#elif defined(LEFT)
|
||||
temp = off+vl; // number of values in A
|
||||
#else
|
||||
temp = off+1; // number of values in B
|
||||
#endif
|
||||
|
||||
for (k = temp/8; k > 0; k--) {
|
||||
va0 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
va1 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
|
||||
ptrbb += 1;
|
||||
va2 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl);
|
||||
ptrbb += 1;
|
||||
va3 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl);
|
||||
ptrbb += 1;
|
||||
va4 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl);
|
||||
ptrbb += 1;
|
||||
va5 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl);
|
||||
ptrbb += 1;
|
||||
va6 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl);
|
||||
ptrbb += 1;
|
||||
va7 = VLEV_FLOAT(ptrba, vl);
|
||||
ptrba += vl;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl);
|
||||
ptrbb += 1;
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl);
|
||||
ptrbb += 1;
|
||||
}
|
||||
|
||||
// K remainder
|
||||
for (k = temp&7; k > 0; k--) {
|
||||
va0 = VLEV_FLOAT(ptrba, vl);
|
||||
|
||||
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
|
||||
|
||||
ptrbb += 1;
|
||||
ptrba += vl;
|
||||
}
|
||||
va0 = VFMULVF_FLOAT(vres0, alpha, vl);
|
||||
VSEV_FLOAT(C0, va0, vl);
|
||||
|
||||
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
temp = bk - off;
|
||||
#ifdef LEFT
|
||||
temp -= vl; // number of values in A
|
||||
#else
|
||||
temp -= 1; // number of values in B
|
||||
#endif
|
||||
ptrba += temp*vl;
|
||||
ptrbb += temp*1;
|
||||
#endif
|
||||
|
||||
#ifdef LEFT
|
||||
off += vl; // number of values in A
|
||||
#endif
|
||||
|
||||
C0 += vl;
|
||||
}
|
||||
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
off += 1;
|
||||
#endif
|
||||
|
||||
bb += (bk);
|
||||
C += (ldc);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
|
@@ -0,0 +1,847 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m2(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m2()
|
||||
#define FLOAT_V_T vfloat32m2_t
|
||||
#define VLEV_FLOAT vle32_v_f32m2
|
||||
#define VLSEV_FLOAT vlse32_v_f32m2
|
||||
#define VLSEG2_FLOAT vlseg2e32_v_f32m2
|
||||
#define VSEV_FLOAT vse32_v_f32m2
|
||||
#define VSSEV_FLOAT vsse32_v_f32m2
|
||||
#define VSSEG2_FLOAT vsseg2e32_v_f32m2
|
||||
#define VFMACCVF_FLOAT vfmacc_vf_f32m2
|
||||
#define VFMULVF_FLOAT vfmul_vf_f32m2
|
||||
#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m2(n)
|
||||
#define VSETVL_MAX vsetvlmax_e64m2()
|
||||
#define FLOAT_V_T vfloat64m2_t
|
||||
#define VLEV_FLOAT vle64_v_f64m2
|
||||
#define VLSEV_FLOAT vlse64_v_f64m2
|
||||
#define VLSEG2_FLOAT vlseg2e64_v_f64m2
|
||||
#define VSEV_FLOAT vse64_v_f64m2
|
||||
#define VSSEV_FLOAT vsse64_v_f64m2
|
||||
#define VSSEG2_FLOAT vsseg2e64_v_f64m2
|
||||
#define VFMACCVF_FLOAT vfmacc_vf_f64m2
|
||||
#define VFMULVF_FLOAT vfmul_vf_f64m2
|
||||
#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2
|
||||
#endif
|
||||
|
||||
|
||||
static FLOAT dm1 = -1.;
|
||||
|
||||
#ifdef CONJ
|
||||
#define GEMM_KERNEL GEMM_KERNEL_L
|
||||
#else
|
||||
#define GEMM_KERNEL GEMM_KERNEL_N
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 1
|
||||
#define GEMM_UNROLL_N_SHIFT 0
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 2
|
||||
#define GEMM_UNROLL_N_SHIFT 1
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 4
|
||||
#define GEMM_UNROLL_N_SHIFT 2
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 8
|
||||
#define GEMM_UNROLL_N_SHIFT 3
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 16
|
||||
#define GEMM_UNROLL_N_SHIFT 4
|
||||
#endif
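/*
 * GEMM_UNROLL_N_SHIFT is log2(GEMM_DEFAULT_UNROLL_N); the driver at the
 * bottom of this file uses it to peel the n dimension into power-of-two
 * column blocks (GEMM_UNROLL_N first, then the n & j remainders).
 */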
|
||||
|
||||
// Optimizes the implementation in ../arm64/trsm_kernel_LN_sve.c
|
||||
|
||||
#ifndef COMPLEX
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 1
|
||||
|
||||
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
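/*
 * Backward substitution over the rows of the m x m triangular panel: rows
 * are visited from m-1 down to 0.  Row i of c is scaled by the packed
 * diagonal entry a[i] (OpenBLAS trsm packing normally stores the
 * reciprocal, so this multiply performs the division), the solved value is
 * streamed into the packed buffer b, and the i rows above are updated with
 * a vectorized axpy.  VSETVL(k) returns min(k, VLMAX), so the strip-mined
 * k loop needs no scalar tail.
 */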
|
||||
|
||||
FLOAT aa, bb;
|
||||
FLOAT *pa, *pc;
|
||||
|
||||
int i, j, k;
|
||||
//fprintf(stderr, "%s , %s, m = %4ld n = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, ldc); // Debug
|
||||
|
||||
size_t vl;
|
||||
FLOAT_V_T va, vc;
|
||||
|
||||
a += (m - 1) * m;
|
||||
b += (m - 1) * n;
|
||||
|
||||
for (i = m - 1; i >= 0; i--)
|
||||
{
|
||||
aa = *(a + i);
|
||||
for (j = 0; j < n; j ++)
|
||||
{
|
||||
bb = *(c + i + j * ldc);
|
||||
bb *= aa;
|
||||
*b = bb;
|
||||
*(c + i + j * ldc) = bb;
|
||||
b ++;
|
||||
|
||||
pa = a;
|
||||
pc = c + j * ldc;
|
||||
for (k = i; k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
vc = VLEV_FLOAT(pc, vl);
|
||||
va = VLEV_FLOAT(pa, vl);
|
||||
vc = VFNMSACVF_FLOAT(vc, bb, va, vl);
|
||||
VSEV_FLOAT(pc, vc, vl);
|
||||
pa += vl;
|
||||
pc += vl;
|
||||
}
|
||||
}
|
||||
a -= m;
|
||||
b -= 2 * n;
|
||||
}
|
||||
|
||||
}
|
||||
#elif GEMM_DEFAULT_UNROLL_N == 2
|
||||
|
||||
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
|
||||
|
||||
FLOAT aa, bb0, bb1;
|
||||
FLOAT *pa, *pc, *pc0, *pc1;
|
||||
FLOAT *pb0, *pb1;
|
||||
|
||||
int i, j, k;
|
||||
//fprintf(stderr, "%s , %s, m = %4ld n = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, ldc); // Debug
|
||||
|
||||
size_t vl;
|
||||
FLOAT_V_T va, vc0, vc1;
|
||||
|
||||
a += (m - 1) * m;
|
||||
b += (m - 1) * n;
|
||||
|
||||
for (i = m - 1; i >= 0; i--)
|
||||
{
|
||||
aa = *(a + i);
|
||||
pc = c + i;
|
||||
for (j = 0; j < n/2; j ++)
|
||||
{
|
||||
//bb = *(c + i + j * ldc);
|
||||
pb0 = pc + j * ldc * 2;
|
||||
pb1 = pb0 + ldc;
|
||||
//bb *= aa;
|
||||
bb0 = (*pb0) * aa;
|
||||
bb1 = (*pb1) * aa;
|
||||
//*b = bb;
|
||||
*b = bb0;
|
||||
*(b+1) = bb1;
|
||||
*pb0 = bb0;
|
||||
*pb1 = bb1;
|
||||
|
||||
//*(c + i + j * ldc) = bb;
|
||||
//b ++;
|
||||
|
||||
b += 2;
|
||||
//pa = a + i + 1;
|
||||
pc0 = c + j * ldc * 2;
|
||||
pc1 = pc0 + ldc;
|
||||
pa = a;
|
||||
//pc = c + j * ldc;
|
||||
for (k = i; k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
vc0 = VLEV_FLOAT(pc0, vl);
|
||||
vc1 = VLEV_FLOAT(pc1, vl);
|
||||
va = VLEV_FLOAT(pa, vl);
|
||||
vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
|
||||
vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
|
||||
VSEV_FLOAT(pc0, vc0, vl);
|
||||
VSEV_FLOAT(pc1, vc1, vl);
|
||||
|
||||
pa += vl;
|
||||
pc0 += vl;
|
||||
pc1 += vl;
|
||||
}
|
||||
}
|
||||
pc += ldc * (n/2) * 2;
|
||||
if (n & 1)
|
||||
{
|
||||
pb0 = pc;
|
||||
bb0 = (*pb0) * aa;
|
||||
*b = bb0;
|
||||
*pb0 = bb0;
|
||||
b += 1;
|
||||
|
||||
pc0 = pc - i;
|
||||
pa = a;
|
||||
for (k = i; k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
vc0 = VLEV_FLOAT(pc0, vl);
|
||||
va = VLEV_FLOAT(pa, vl);
|
||||
vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
|
||||
VSEV_FLOAT(pc0, vc0, vl);
|
||||
|
||||
pa += vl;
|
||||
pc0 += vl;
|
||||
}
|
||||
}
|
||||
|
||||
a -= m;
|
||||
b -= 2 * n;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#elif GEMM_DEFAULT_UNROLL_N == 4
|
||||
|
||||
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
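/*
 * Same backward substitution as the GEMM_DEFAULT_UNROLL_N == 1 variant,
 * but four columns of c are solved and rank-1-updated per pass through the
 * j loop; the leftover columns are handled by the n & 2 and n & 1 blocks
 * further down.
 */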
|
||||
|
||||
FLOAT aa, bb0, bb1, bb2, bb3;
|
||||
FLOAT *pa, *pc, *pc0, *pc1, *pc2, *pc3;
|
||||
FLOAT *pb0, *pb1, *pb2, *pb3;
|
||||
|
||||
int i, j, k;
|
||||
|
||||
size_t vl;
|
||||
FLOAT_V_T va, vc0, vc1, vc2, vc3;
|
||||
|
||||
a += (m - 1) * m;
|
||||
b += (m - 1) * n;
|
||||
|
||||
for (i = m - 1; i >= 0; i--)
|
||||
{
|
||||
aa = *(a + i);
|
||||
pc = c + i;
|
||||
for (j = 0; j < n/4; j ++)
|
||||
{
|
||||
pb0 = pc + j * ldc * 4;
|
||||
pb1 = pb0 + ldc;
|
||||
pb2 = pb1 + ldc;
|
||||
pb3 = pb2 + ldc;
|
||||
|
||||
bb0 = (*pb0) * aa;
|
||||
bb1 = (*pb1) * aa;
|
||||
bb2 = (*pb2) * aa;
|
||||
bb3 = (*pb3) * aa;
|
||||
|
||||
*b = bb0;
|
||||
*(b+1) = bb1;
|
||||
*(b+2) = bb2;
|
||||
*(b+3) = bb3;
|
||||
|
||||
*pb0 = bb0;
|
||||
*pb1 = bb1;
|
||||
*pb2 = bb2;
|
||||
*pb3 = bb3;
|
||||
|
||||
b += 4;
|
||||
|
||||
pc0 = c + j * ldc * 4;
|
||||
pc1 = pc0 + ldc;
|
||||
pc2 = pc1 + ldc;
|
||||
pc3 = pc2 + ldc;
|
||||
|
||||
pa = a;
|
||||
for (k = i; k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
vc0 = VLEV_FLOAT(pc0, vl);
|
||||
vc1 = VLEV_FLOAT(pc1, vl);
|
||||
vc2 = VLEV_FLOAT(pc2, vl);
|
||||
vc3 = VLEV_FLOAT(pc3, vl);
|
||||
va = VLEV_FLOAT(pa, vl);
|
||||
vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
|
||||
vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
|
||||
vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl);
|
||||
vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl);
|
||||
VSEV_FLOAT(pc0, vc0, vl);
|
||||
VSEV_FLOAT(pc1, vc1, vl);
|
||||
VSEV_FLOAT(pc2, vc2, vl);
|
||||
VSEV_FLOAT(pc3, vc3, vl);
|
||||
|
||||
pa += vl;
|
||||
pc0 += vl;
|
||||
pc1 += vl;
|
||||
pc2 += vl;
|
||||
pc3 += vl;
|
||||
}
|
||||
}
|
||||
pc += ldc * (n/4) * 4;
|
||||
|
||||
if (n & 2)
|
||||
{
|
||||
pb0 = pc; /* pc already points at row i of the first leftover column */
|
||||
pb1 = pb0 + ldc;
|
||||
|
||||
bb0 = (*pb0) * aa;
|
||||
bb1 = (*pb1) * aa;
|
||||
|
||||
*b = bb0;
|
||||
*(b+1) = bb1;
|
||||
|
||||
*pb0 = bb0;
|
||||
*pb1 = bb1;
|
||||
|
||||
b += 2;
|
||||
|
||||
pc0 = pc - i; /* row 0 of that same column */
|
||||
pc1 = pc0 + ldc;
|
||||
|
||||
pa = a;
|
||||
for (k = i; k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
vc0 = VLEV_FLOAT(pc0, vl);
|
||||
vc1 = VLEV_FLOAT(pc1, vl);
|
||||
va = VLEV_FLOAT(pa, vl);
|
||||
vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
|
||||
vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
|
||||
VSEV_FLOAT(pc0, vc0, vl);
|
||||
VSEV_FLOAT(pc1, vc1, vl);
|
||||
|
||||
pa += vl;
|
||||
pc0 += vl;
|
||||
pc1 += vl;
|
||||
}
|
||||
pc += ldc * 2;
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
pb0 = pc;
|
||||
bb0 = (*pb0) * aa;
|
||||
*b = bb0;
|
||||
*pb0 = bb0;
|
||||
b += 1;
|
||||
|
||||
pc0 = pc - i;
|
||||
pa = a;
|
||||
for (k = i; k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
vc0 = VLEV_FLOAT(pc0, vl);
|
||||
va = VLEV_FLOAT(pa, vl);
|
||||
vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
|
||||
VSEV_FLOAT(pc0, vc0, vl);
|
||||
|
||||
pa += vl;
|
||||
pc0 += vl;
|
||||
}
|
||||
}
|
||||
|
||||
a -= m;
|
||||
b -= 2 * n;
|
||||
}
|
||||
|
||||
}
|
||||
#elif GEMM_DEFAULT_UNROLL_N == 8
|
||||
|
||||
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
|
||||
|
||||
FLOAT aa, bb0, bb1, bb2, bb3, bb4, bb5, bb6, bb7;
|
||||
FLOAT *pa, *pc, *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7;
|
||||
FLOAT *pb0, *pb1, *pb2, *pb3, *pb4, *pb5, *pb6, *pb7;
|
||||
|
||||
int i, j, k;
|
||||
|
||||
size_t vl;
|
||||
FLOAT_V_T va, vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7;
|
||||
|
||||
a += (m - 1) * m;
|
||||
b += (m - 1) * n;
|
||||
|
||||
for (i = m - 1; i >= 0; i--)
|
||||
{
|
||||
aa = *(a + i);
|
||||
pc = c + i;
|
||||
for (j = 0; j < n/8; j ++)
|
||||
{
|
||||
pb0 = pc + j * ldc * 8;
|
||||
pb1 = pb0 + ldc;
|
||||
pb2 = pb1 + ldc;
|
||||
pb3 = pb2 + ldc;
|
||||
pb4 = pb3 + ldc;
|
||||
pb5 = pb4 + ldc;
|
||||
pb6 = pb5 + ldc;
|
||||
pb7 = pb6 + ldc;
|
||||
|
||||
bb0 = (*pb0) * aa;
|
||||
bb1 = (*pb1) * aa;
|
||||
bb2 = (*pb2) * aa;
|
||||
bb3 = (*pb3) * aa;
|
||||
bb4 = (*pb4) * aa;
|
||||
bb5 = (*pb5) * aa;
|
||||
bb6 = (*pb6) * aa;
|
||||
bb7 = (*pb7) * aa;
|
||||
|
||||
*b = bb0;
|
||||
*(b+1) = bb1;
|
||||
*(b+2) = bb2;
|
||||
*(b+3) = bb3;
|
||||
*(b+4) = bb4;
|
||||
*(b+5) = bb5;
|
||||
*(b+6) = bb6;
|
||||
*(b+7) = bb7;
|
||||
|
||||
*pb0 = bb0;
|
||||
*pb1 = bb1;
|
||||
*pb2 = bb2;
|
||||
*pb3 = bb3;
|
||||
*pb4 = bb4;
|
||||
*pb5 = bb5;
|
||||
*pb6 = bb6;
|
||||
*pb7 = bb7;
|
||||
|
||||
b += 8;
|
||||
|
||||
pc0 = c + j * ldc * 8;
|
||||
pc1 = pc0 + ldc;
|
||||
pc2 = pc1 + ldc;
|
||||
pc3 = pc2 + ldc;
|
||||
pc4 = pc3 + ldc;
|
||||
pc5 = pc4 + ldc;
|
||||
pc6 = pc5 + ldc;
|
||||
pc7 = pc6 + ldc;
|
||||
|
||||
pa = a;
|
||||
for (k = i; k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
vc0 = VLEV_FLOAT(pc0, vl);
|
||||
vc1 = VLEV_FLOAT(pc1, vl);
|
||||
vc2 = VLEV_FLOAT(pc2, vl);
|
||||
vc3 = VLEV_FLOAT(pc3, vl);
|
||||
vc4 = VLEV_FLOAT(pc4, vl);
|
||||
vc5 = VLEV_FLOAT(pc5, vl);
|
||||
vc6 = VLEV_FLOAT(pc6, vl);
|
||||
vc7 = VLEV_FLOAT(pc7, vl);
|
||||
va = VLEV_FLOAT(pa, vl);
|
||||
vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
|
||||
vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
|
||||
vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl);
|
||||
vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl);
|
||||
vc4 = VFNMSACVF_FLOAT(vc4, bb4, va, vl);
|
||||
vc5 = VFNMSACVF_FLOAT(vc5, bb5, va, vl);
|
||||
vc6 = VFNMSACVF_FLOAT(vc6, bb6, va, vl);
|
||||
vc7 = VFNMSACVF_FLOAT(vc7, bb7, va, vl);
|
||||
VSEV_FLOAT(pc0, vc0, vl);
|
||||
VSEV_FLOAT(pc1, vc1, vl);
|
||||
VSEV_FLOAT(pc2, vc2, vl);
|
||||
VSEV_FLOAT(pc3, vc3, vl);
|
||||
VSEV_FLOAT(pc4, vc4, vl);
|
||||
VSEV_FLOAT(pc5, vc5, vl);
|
||||
VSEV_FLOAT(pc6, vc6, vl);
|
||||
VSEV_FLOAT(pc7, vc7, vl);
|
||||
|
||||
pa += vl;
|
||||
pc0 += vl;
|
||||
pc1 += vl;
|
||||
pc2 += vl;
|
||||
pc3 += vl;
|
||||
pc4 += vl;
|
||||
pc5 += vl;
|
||||
pc6 += vl;
|
||||
pc7 += vl;
|
||||
}
|
||||
}
|
||||
pc += ldc * (n/8) * 8;
|
||||
|
||||
if (n & 4)
|
||||
{
|
||||
pb0 = pc;
|
||||
pb1 = pb0 + ldc;
|
||||
pb2 = pb1 + ldc;
|
||||
pb3 = pb2 + ldc;
|
||||
|
||||
bb0 = (*pb0) * aa;
|
||||
bb1 = (*pb1) * aa;
|
||||
bb2 = (*pb2) * aa;
|
||||
bb3 = (*pb3) * aa;
|
||||
|
||||
*b = bb0;
|
||||
*(b+1) = bb1;
|
||||
*(b+2) = bb2;
|
||||
*(b+3) = bb3;
|
||||
|
||||
*pb0 = bb0;
|
||||
*pb1 = bb1;
|
||||
*pb2 = bb2;
|
||||
*pb3 = bb3;
|
||||
|
||||
b += 4;
|
||||
|
||||
pc0 = pc - i;
|
||||
pc1 = pc0 + ldc;
|
||||
pc2 = pc1 + ldc;
|
||||
pc3 = pc2 + ldc;
|
||||
|
||||
pa = a;
|
||||
for (k = i; k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
vc0 = VLEV_FLOAT(pc0, vl);
|
||||
vc1 = VLEV_FLOAT(pc1, vl);
|
||||
vc2 = VLEV_FLOAT(pc2, vl);
|
||||
vc3 = VLEV_FLOAT(pc3, vl);
|
||||
va = VLEV_FLOAT(pa, vl);
|
||||
vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
|
||||
vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
|
||||
vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl);
|
||||
vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl);
|
||||
VSEV_FLOAT(pc0, vc0, vl);
|
||||
VSEV_FLOAT(pc1, vc1, vl);
|
||||
VSEV_FLOAT(pc2, vc2, vl);
|
||||
VSEV_FLOAT(pc3, vc3, vl);
|
||||
|
||||
pa += vl;
|
||||
pc0 += vl;
|
||||
pc1 += vl;
|
||||
pc2 += vl;
|
||||
pc3 += vl;
|
||||
}
|
||||
pc += ldc * 4;
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
{
|
||||
pb0 = pc;
|
||||
pb1 = pb0 + ldc;
|
||||
|
||||
bb0 = (*pb0) * aa;
|
||||
bb1 = (*pb1) * aa;
|
||||
|
||||
*b = bb0;
|
||||
*(b+1) = bb1;
|
||||
|
||||
*pb0 = bb0;
|
||||
*pb1 = bb1;
|
||||
|
||||
b += 2;
|
||||
|
||||
pc0 = pc - i;
|
||||
pc1 = pc0 + ldc;
|
||||
|
||||
pa = a;
|
||||
for (k = i; k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
vc0 = VLEV_FLOAT(pc0, vl);
|
||||
vc1 = VLEV_FLOAT(pc1, vl);
|
||||
va = VLEV_FLOAT(pa, vl);
|
||||
vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
|
||||
vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
|
||||
VSEV_FLOAT(pc0, vc0, vl);
|
||||
VSEV_FLOAT(pc1, vc1, vl);
|
||||
|
||||
pa += vl;
|
||||
pc0 += vl;
|
||||
pc1 += vl;
|
||||
}
|
||||
pc += ldc * 2;
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
pb0 = pc;
|
||||
bb0 = (*pb0) * aa;
|
||||
*b = bb0;
|
||||
*pb0 = bb0;
|
||||
b += 1;
|
||||
|
||||
pc0 = pc - i;
|
||||
pa = a;
|
||||
for (k = i; k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
vc0 = VLEV_FLOAT(pc0, vl);
|
||||
va = VLEV_FLOAT(pa, vl);
|
||||
vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
|
||||
VSEV_FLOAT(pc0, vc0, vl);
|
||||
|
||||
pa += vl;
|
||||
pc0 += vl;
|
||||
}
|
||||
}
|
||||
|
||||
a -= m;
|
||||
b -= 2 * n;
|
||||
}
|
||||
|
||||
}
|
||||
#else
|
||||
static inline void solve_generic(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
|
||||
|
||||
FLOAT aa, bb;
|
||||
|
||||
int i, j, k;
|
||||
|
||||
a += (m - 1) * m;
|
||||
b += (m - 1) * n;
|
||||
|
||||
for (i = m - 1; i >= 0; i--) {
|
||||
|
||||
aa = *(a + i);
|
||||
|
||||
for (j = 0; j < n; j ++) {
|
||||
bb = *(c + i + j * ldc);
|
||||
bb *= aa;
|
||||
*b = bb;
|
||||
*(c + i + j * ldc) = bb;
|
||||
b ++;
|
||||
|
||||
for (k = 0; k < i; k ++){
|
||||
*(c + k + j * ldc) -= bb * *(a + k);
|
||||
}
|
||||
|
||||
}
|
||||
a -= m;
|
||||
b -= 2 * n;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
|
||||
|
||||
FLOAT aa1, aa2;
|
||||
FLOAT bb1, bb2;
|
||||
FLOAT cc1, cc2;
|
||||
|
||||
int i, j, k;
|
||||
|
||||
ldc *= 2;
|
||||
a += (m - 1) * m * 2;
|
||||
b += (m - 1) * n * 2;
|
||||
|
||||
for (i = m - 1; i >= 0; i--) {
|
||||
|
||||
aa1 = *(a + i * 2 + 0);
|
||||
aa2 = *(a + i * 2 + 1);
|
||||
|
||||
for (j = 0; j < n; j ++) {
|
||||
bb1 = *(c + i * 2 + 0 + j * ldc);
|
||||
bb2 = *(c + i * 2 + 1 + j * ldc);
|
||||
|
||||
#ifndef CONJ
|
||||
cc1 = aa1 * bb1 - aa2 * bb2;
|
||||
cc2 = aa1 * bb2 + aa2 * bb1;
|
||||
#else
|
||||
cc1 = aa1 * bb1 + aa2 * bb2;
|
||||
cc2 = aa1 * bb2 - aa2 * bb1;
|
||||
#endif
|
||||
|
||||
|
||||
*(b + 0) = cc1;
|
||||
*(b + 1) = cc2;
|
||||
*(c + i * 2 + 0 + j * ldc) = cc1;
|
||||
*(c + i * 2 + 1 + j * ldc) = cc2;
|
||||
b += 2;
|
||||
|
||||
for (k = 0; k < i; k ++){
|
||||
#ifndef CONJ
|
||||
*(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1);
|
||||
*(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0);
|
||||
#else
|
||||
*(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1);
|
||||
*(c + k * 2 + 1 + j * ldc) -= - cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0);
|
||||
#endif
|
||||
}
|
||||
|
||||
}
|
||||
a -= m * 2;
|
||||
b -= 4 * n;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
|
||||
#ifdef COMPLEX
|
||||
FLOAT dummy2,
|
||||
#endif
|
||||
FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){
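/*
 * Outer driver for the LN kernel: n is tiled by GEMM_UNROLL_N and m by the
 * runtime vector length vl (VSETVL_MAX), walking the row blocks from the
 * bottom of the panel upwards.  kk starts at m + offset and shrinks as
 * blocks are solved; while k - kk > 0, GEMM_KERNEL applies the blocks
 * solved in earlier iterations to cc (alpha = dm1 = -1 makes the call a
 * subtraction) and solve() then finishes the triangular diagonal block in
 * place.
 */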
|
||||
|
||||
BLASLONG i, j;
|
||||
FLOAT *aa, *cc;
|
||||
BLASLONG kk;
|
||||
|
||||
size_t vl = VSETVL_MAX;
|
||||
|
||||
//fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug
|
||||
|
||||
j = (n >> GEMM_UNROLL_N_SHIFT);
|
||||
|
||||
while (j > 0) {
|
||||
|
||||
kk = m + offset;
|
||||
|
||||
i = m % vl;
|
||||
if (i) {
|
||||
aa = a + (m - i) * k * COMPSIZE;
|
||||
cc = c + (m - i) * COMPSIZE;
|
||||
|
||||
if (k - kk > 0) {
|
||||
GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1,
|
||||
#ifdef COMPLEX
|
||||
ZERO,
|
||||
#endif
|
||||
aa + i * kk * COMPSIZE,
|
||||
b + GEMM_UNROLL_N * kk * COMPSIZE,
|
||||
cc,
|
||||
ldc);
|
||||
}
|
||||
|
||||
solve(i, GEMM_UNROLL_N,
|
||||
aa + (kk - i) * i * COMPSIZE,
|
||||
b + (kk - i) * GEMM_UNROLL_N * COMPSIZE,
|
||||
cc, ldc);
|
||||
|
||||
kk -= i;
|
||||
|
||||
}
|
||||
|
||||
int mod = i;
|
||||
i = vl;
|
||||
if (i <= m) {
|
||||
aa = a + (m - mod - vl) * k * COMPSIZE;
|
||||
cc = c + (m - mod - vl) * COMPSIZE;
|
||||
|
||||
do {
|
||||
if (k - kk > 0) {
|
||||
GEMM_KERNEL(vl, GEMM_UNROLL_N, k - kk, dm1,
|
||||
#ifdef COMPLEX
|
||||
ZERO,
|
||||
#endif
|
||||
aa + vl * kk * COMPSIZE,
|
||||
b + GEMM_UNROLL_N * kk * COMPSIZE,
|
||||
cc,
|
||||
ldc);
|
||||
}
|
||||
|
||||
solve(vl, GEMM_UNROLL_N,
|
||||
aa + (kk - vl) * vl * COMPSIZE,
|
||||
b + (kk - vl) * GEMM_UNROLL_N * COMPSIZE,
|
||||
cc, ldc);
|
||||
|
||||
aa -= vl * k * COMPSIZE;
|
||||
cc -= vl * COMPSIZE;
|
||||
kk -= vl;
|
||||
|
||||
i += vl;
|
||||
} while (i <= m);
|
||||
}
|
||||
|
||||
|
||||
b += GEMM_UNROLL_N * k * COMPSIZE;
|
||||
c += GEMM_UNROLL_N * ldc * COMPSIZE;
|
||||
j --;
|
||||
}
|
||||
|
||||
if (n & (GEMM_UNROLL_N - 1)) {
|
||||
|
||||
j = (GEMM_UNROLL_N >> 1);
|
||||
while (j > 0) {
|
||||
if (n & j) {
|
||||
|
||||
kk = m + offset;
|
||||
|
||||
i = m % vl;
|
||||
if (i) {
|
||||
aa = a + (m - i) * k * COMPSIZE;
|
||||
cc = c + (m - i) * COMPSIZE;
|
||||
|
||||
if (k - kk > 0) {
|
||||
GEMM_KERNEL(i, j, k - kk, dm1,
|
||||
#ifdef COMPLEX
|
||||
ZERO,
|
||||
#endif
|
||||
aa + i * kk * COMPSIZE,
|
||||
b + j * kk * COMPSIZE,
|
||||
cc, ldc);
|
||||
}
|
||||
|
||||
solve(i, j,
|
||||
aa + (kk - i) * i * COMPSIZE,
|
||||
b + (kk - i) * j * COMPSIZE,
|
||||
cc, ldc);
|
||||
|
||||
kk -= i;
|
||||
|
||||
}
|
||||
|
||||
int mod = i;
|
||||
i = vl;
|
||||
if (i <= m) {
|
||||
aa = a + (m - mod - vl) * k * COMPSIZE;
|
||||
cc = c + (m - mod - vl) * COMPSIZE;
|
||||
|
||||
do {
|
||||
if (k - kk > 0) {
|
||||
GEMM_KERNEL(vl, j, k - kk, dm1,
|
||||
#ifdef COMPLEX
|
||||
ZERO,
|
||||
#endif
|
||||
aa + vl * kk * COMPSIZE,
|
||||
b + j * kk * COMPSIZE,
|
||||
cc,
|
||||
ldc);
|
||||
}
|
||||
|
||||
solve(vl, j,
|
||||
aa + (kk - vl) * vl * COMPSIZE,
|
||||
b + (kk - vl) * j * COMPSIZE,
|
||||
cc, ldc);
|
||||
|
||||
aa -= vl * k * COMPSIZE;
|
||||
cc -= vl * COMPSIZE;
|
||||
kk -= vl;
|
||||
|
||||
i += vl;
|
||||
} while (i <= m);
|
||||
}
|
||||
|
||||
b += j * k * COMPSIZE;
|
||||
c += j * ldc * COMPSIZE;
|
||||
}
|
||||
j >>= 1;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,840 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m2(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m2()
|
||||
#define FLOAT_V_T vfloat32m2_t
|
||||
#define VLEV_FLOAT vle32_v_f32m2
|
||||
#define VLSEV_FLOAT vlse32_v_f32m2
|
||||
#define VLSEG2_FLOAT vlseg2e32_v_f32m2
|
||||
#define VSEV_FLOAT vse32_v_f32m2
|
||||
#define VSSEV_FLOAT vsse32_v_f32m2
|
||||
#define VSSEG2_FLOAT vsseg2e32_v_f32m2
|
||||
#define VFMACCVF_FLOAT vfmacc_vf_f32m2
|
||||
#define VFMULVF_FLOAT vfmul_vf_f32m2
|
||||
#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m2(n)
|
||||
#define VSETVL_MAX vsetvlmax_e64m2()
|
||||
#define FLOAT_V_T vfloat64m2_t
|
||||
#define VLEV_FLOAT vle64_v_f64m2
|
||||
#define VLSEV_FLOAT vlse64_v_f64m2
|
||||
#define VLSEG2_FLOAT vlseg2e64_v_f64m2
|
||||
#define VSEV_FLOAT vse64_v_f64m2
|
||||
#define VSSEV_FLOAT vsse64_v_f64m2
|
||||
#define VSSEG2_FLOAT vsseg2e64_v_f64m2
|
||||
#define VFMACCVF_FLOAT vfmacc_vf_f64m2
|
||||
#define VFMULVF_FLOAT vfmul_vf_f64m2
|
||||
#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2
|
||||
#endif
|
||||
|
||||
|
||||
static FLOAT dm1 = -1.;
|
||||
|
||||
#ifdef CONJ
|
||||
#define GEMM_KERNEL GEMM_KERNEL_L
|
||||
#else
|
||||
#define GEMM_KERNEL GEMM_KERNEL_N
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 1
|
||||
#define GEMM_UNROLL_N_SHIFT 0
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 2
|
||||
#define GEMM_UNROLL_N_SHIFT 1
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 4
|
||||
#define GEMM_UNROLL_N_SHIFT 2
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 8
|
||||
#define GEMM_UNROLL_N_SHIFT 3
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 16
|
||||
#define GEMM_UNROLL_N_SHIFT 4
|
||||
#endif
|
||||
|
||||
// Optimizes the implementation in ../arm64/trsm_kernel_LT_sve.c
|
||||
|
||||
#ifndef COMPLEX
|
||||
#if GEMM_DEFAULT_UNROLL_N == 1
|
||||
|
||||
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc)
|
||||
{
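/*
 * Forward substitution for the LT kernel: rows are visited from 0 upwards.
 * Row i of c is scaled by the packed diagonal entry a[i], written to the
 * packed buffer b, and the m - i - 1 rows below are updated with a
 * strip-mined axpy; VSETVL handles the tail, so no scalar loop is needed.
 */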
|
||||
FLOAT aa, bb;
|
||||
FLOAT *pa, *pc;
|
||||
|
||||
int i, j, k;
|
||||
size_t vl;
|
||||
FLOAT_V_T va, vc;
|
||||
for (i = 0; i < m; i++)
|
||||
{
|
||||
aa = *(a + i);
|
||||
for (j = 0; j < n; j ++)
|
||||
{
|
||||
bb = *(c + i + j * ldc);
|
||||
bb *= aa;
|
||||
*b = bb;
|
||||
*(c + i + j * ldc) = bb;
|
||||
b++;
|
||||
pa = a + i + 1;
|
||||
pc = c + j * ldc + i + 1;
|
||||
for (k = (m - i - 1); k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
vc = VLEV_FLOAT(pc, vl);
|
||||
va = VLEV_FLOAT(pa, vl);
|
||||
vc = VFNMSACVF_FLOAT(vc, bb, va, vl);
|
||||
VSEV_FLOAT(pc, vc, vl);
|
||||
pa += vl;
|
||||
pc += vl;
|
||||
}
|
||||
}
|
||||
a += m;
|
||||
}
|
||||
}
|
||||
#elif GEMM_DEFAULT_UNROLL_N == 2
|
||||
|
||||
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc)
|
||||
{
|
||||
|
||||
FLOAT aa, bb0, bb1;
|
||||
FLOAT *pa, *pc, *pc0, *pc1;
|
||||
FLOAT *pb0, *pb1;
|
||||
|
||||
int i, j, k;
|
||||
size_t vl;
|
||||
FLOAT_V_T va, vc0, vc1;
|
||||
for (i = 0; i < m; i++)
|
||||
{
|
||||
aa = *(a + i);
|
||||
pc = c + i;
|
||||
for (j = 0; j < n/2; j ++)
|
||||
{
|
||||
pb0 = pc + j * ldc * 2;
|
||||
pb1 = pb0 + ldc;
|
||||
bb0 = (*pb0) * aa;
|
||||
bb1 = (*pb1) * aa;
|
||||
*b = bb0;
|
||||
*(b+1) = bb1;
|
||||
*pb0 = bb0;
|
||||
*pb1 = bb1;
|
||||
b += 2;
|
||||
pa = a + i + 1;
|
||||
pc0 = pb0 + 1;
|
||||
pc1 = pc0 + ldc;
|
||||
for (k = (m - i - 1); k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
vc0 = VLEV_FLOAT(pc0, vl);
|
||||
vc1 = VLEV_FLOAT(pc1, vl);
|
||||
va = VLEV_FLOAT(pa, vl);
|
||||
vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
|
||||
vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
|
||||
VSEV_FLOAT(pc0, vc0, vl);
|
||||
VSEV_FLOAT(pc1, vc1, vl);
|
||||
pa += vl;
|
||||
pc0 += vl;
|
||||
pc1 += vl;
|
||||
}
|
||||
}
|
||||
pc += ldc * (n/2) * 2;
|
||||
if (n & 1)
|
||||
{
|
||||
pb0 = pc;
|
||||
bb0 = *(pb0);
|
||||
bb0 *= aa;
|
||||
*b = bb0;
|
||||
*pb0 = bb0;
|
||||
b++;
|
||||
pa = a + i + 1;
|
||||
pc0 = pb0 + 1;
|
||||
for (k = (m - i - 1); k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
vc0 = VLEV_FLOAT(pc0, vl);
|
||||
va = VLEV_FLOAT(pa, vl);
|
||||
vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
|
||||
VSEV_FLOAT(pc0, vc0, vl);
|
||||
pa += vl;
|
||||
pc0 += vl;
|
||||
}
|
||||
}
|
||||
|
||||
a += m;
|
||||
}
|
||||
}
|
||||
#elif GEMM_DEFAULT_UNROLL_N == 4
|
||||
|
||||
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc)
|
||||
{
|
||||
|
||||
FLOAT aa, bb0, bb1, bb2, bb3;
|
||||
FLOAT *pa, *pc;
|
||||
FLOAT *pc0, *pc1, *pc2, *pc3;
|
||||
FLOAT *pb0, *pb1, *pb2, *pb3;
|
||||
|
||||
int i, j, k;
|
||||
size_t vl;
|
||||
FLOAT_V_T va;
|
||||
FLOAT_V_T vc0, vc1, vc2, vc3;
|
||||
for (i = 0; i < m; i++)
|
||||
{
|
||||
aa = *(a + i);
|
||||
pc = c + i;
|
||||
for (j = 0; j < n/4; j ++)
|
||||
{
|
||||
pb0 = pc;
|
||||
pb1 = pb0 + ldc;
|
||||
pb2 = pb1 + ldc;
|
||||
pb3 = pb2 + ldc;
|
||||
|
||||
bb0 = (*pb0) * aa;
|
||||
bb1 = (*pb1) * aa;
|
||||
bb2 = (*pb2) * aa;
|
||||
bb3 = (*pb3) * aa;
|
||||
|
||||
*b = bb0;
|
||||
*(b+1) = bb1;
|
||||
*(b+2) = bb2;
|
||||
*(b+3) = bb3;
|
||||
|
||||
*pb0 = bb0;
|
||||
*pb1 = bb1;
|
||||
*pb2 = bb2;
|
||||
*pb3 = bb3;
|
||||
b += 4;
|
||||
|
||||
pa = a + i + 1;
|
||||
pc0 = pb0 + 1;
|
||||
pc1 = pc0 + ldc;
|
||||
pc2 = pc1 + ldc;
|
||||
pc3 = pc2 + ldc;
|
||||
|
||||
for (k = (m - i - 1); k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
vc0 = VLEV_FLOAT(pc0, vl);
|
||||
vc1 = VLEV_FLOAT(pc1, vl);
|
||||
vc2 = VLEV_FLOAT(pc2, vl);
|
||||
vc3 = VLEV_FLOAT(pc3, vl);
|
||||
|
||||
va = VLEV_FLOAT(pa, vl);
|
||||
|
||||
vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
|
||||
vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
|
||||
vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl);
|
||||
vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl);
|
||||
|
||||
VSEV_FLOAT(pc0, vc0, vl);
|
||||
VSEV_FLOAT(pc1, vc1, vl);
|
||||
VSEV_FLOAT(pc2, vc2, vl);
|
||||
VSEV_FLOAT(pc3, vc3, vl);
|
||||
|
||||
pa += vl;
|
||||
pc0 += vl;
|
||||
pc1 += vl;
|
||||
pc2 += vl;
|
||||
pc3 += vl;
|
||||
}
|
||||
}
|
||||
pc += ldc * (n/4) * 4;
|
||||
|
||||
if (n & 2)
|
||||
{
|
||||
pb0 = pc;
|
||||
pb1 = pb0 + ldc;
|
||||
bb0 = (*pb0) * aa;
|
||||
bb1 = (*pb1) * aa;
|
||||
*b = bb0;
|
||||
*(b+1) = bb1;
|
||||
*pb0 = bb0;
|
||||
*pb1 = bb1;
|
||||
b += 2;
|
||||
pa = a + i + 1;
|
||||
pc0 = pb0 + 1;
|
||||
pc1 = pc0 + ldc;
|
||||
for (k = (m - i - 1); k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
vc0 = VLEV_FLOAT(pc0, vl);
|
||||
vc1 = VLEV_FLOAT(pc1, vl);
|
||||
va = VLEV_FLOAT(pa, vl);
|
||||
vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
|
||||
vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
|
||||
VSEV_FLOAT(pc0, vc0, vl);
|
||||
VSEV_FLOAT(pc1, vc1, vl);
|
||||
pa += vl;
|
||||
pc0 += vl;
|
||||
pc1 += vl;
|
||||
}
|
||||
pc += ldc * 2;
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
pb0 = pc;
|
||||
bb0 = *(pb0);
|
||||
bb0 *= aa;
|
||||
*b = bb0;
|
||||
*pb0 = bb0;
|
||||
b++;
|
||||
pa = a + i + 1;
|
||||
pc0 = pb0 + 1;
|
||||
for (k = (m - i - 1); k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
vc0 = VLEV_FLOAT(pc0, vl);
|
||||
va = VLEV_FLOAT(pa, vl);
|
||||
vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
|
||||
VSEV_FLOAT(pc0, vc0, vl);
|
||||
pa += vl;
|
||||
pc0 += vl;
|
||||
}
|
||||
}
|
||||
|
||||
a += m;
|
||||
}
|
||||
}
|
||||
#elif GEMM_DEFAULT_UNROLL_N == 8
|
||||
|
||||
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc)
|
||||
{
|
||||
|
||||
FLOAT aa, bb0, bb1, bb2, bb3, bb4, bb5, bb6, bb7;
|
||||
FLOAT *pa, *pc;
|
||||
FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7;
|
||||
FLOAT *pb0, *pb1, *pb2, *pb3, *pb4, *pb5, *pb6, *pb7;
|
||||
|
||||
int i, j, k;
|
||||
size_t vl;
|
||||
FLOAT_V_T va;
|
||||
FLOAT_V_T vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7;
|
||||
for (i = 0; i < m; i++)
|
||||
{
|
||||
aa = *(a + i);
|
||||
pc = c + i;
|
||||
for (j = 0; j < n/8; j ++)
|
||||
{
|
||||
pb0 = pc + j * ldc * 8;
|
||||
pb1 = pb0 + ldc;
|
||||
pb2 = pb1 + ldc;
|
||||
pb3 = pb2 + ldc;
|
||||
pb4 = pb3 + ldc;
|
||||
pb5 = pb4 + ldc;
|
||||
pb6 = pb5 + ldc;
|
||||
pb7 = pb6 + ldc;
|
||||
|
||||
bb0 = (*pb0) * aa;
|
||||
bb1 = (*pb1) * aa;
|
||||
bb2 = (*pb2) * aa;
|
||||
bb3 = (*pb3) * aa;
|
||||
bb4 = (*pb4) * aa;
|
||||
bb5 = (*pb5) * aa;
|
||||
bb6 = (*pb6) * aa;
|
||||
bb7 = (*pb7) * aa;
|
||||
|
||||
*b = bb0;
|
||||
*(b+1) = bb1;
|
||||
*(b+2) = bb2;
|
||||
*(b+3) = bb3;
|
||||
*(b+4) = bb4;
|
||||
*(b+5) = bb5;
|
||||
*(b+6) = bb6;
|
||||
*(b+7) = bb7;
|
||||
|
||||
*pb0 = bb0;
|
||||
*pb1 = bb1;
|
||||
*pb2 = bb2;
|
||||
*pb3 = bb3;
|
||||
*pb4 = bb4;
|
||||
*pb5 = bb5;
|
||||
*pb6 = bb6;
|
||||
*pb7 = bb7;
|
||||
b += 8;
|
||||
|
||||
pa = a + i + 1;
|
||||
pc0 = pb0 + 1;
|
||||
pc1 = pc0 + ldc;
|
||||
pc2 = pc1 + ldc;
|
||||
pc3 = pc2 + ldc;
|
||||
pc4 = pc3 + ldc;
|
||||
pc5 = pc4 + ldc;
|
||||
pc6 = pc5 + ldc;
|
||||
pc7 = pc6 + ldc;
|
||||
|
||||
for (k = (m - i - 1); k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
vc0 = VLEV_FLOAT(pc0, vl);
|
||||
vc1 = VLEV_FLOAT(pc1, vl);
|
||||
vc2 = VLEV_FLOAT(pc2, vl);
|
||||
vc3 = VLEV_FLOAT(pc3, vl);
|
||||
vc4 = VLEV_FLOAT(pc4, vl);
|
||||
vc5 = VLEV_FLOAT(pc5, vl);
|
||||
vc6 = VLEV_FLOAT(pc6, vl);
|
||||
vc7 = VLEV_FLOAT(pc7, vl);
|
||||
|
||||
va = VLEV_FLOAT(pa, vl);
|
||||
|
||||
vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
|
||||
vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
|
||||
vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl);
|
||||
vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl);
|
||||
vc4 = VFNMSACVF_FLOAT(vc4, bb4, va, vl);
|
||||
vc5 = VFNMSACVF_FLOAT(vc5, bb5, va, vl);
|
||||
vc6 = VFNMSACVF_FLOAT(vc6, bb6, va, vl);
|
||||
vc7 = VFNMSACVF_FLOAT(vc7, bb7, va, vl);
|
||||
|
||||
VSEV_FLOAT(pc0, vc0, vl);
|
||||
VSEV_FLOAT(pc1, vc1, vl);
|
||||
VSEV_FLOAT(pc2, vc2, vl);
|
||||
VSEV_FLOAT(pc3, vc3, vl);
|
||||
VSEV_FLOAT(pc4, vc4, vl);
|
||||
VSEV_FLOAT(pc5, vc5, vl);
|
||||
VSEV_FLOAT(pc6, vc6, vl);
|
||||
VSEV_FLOAT(pc7, vc7, vl);
|
||||
|
||||
pa += vl;
|
||||
pc0 += vl;
|
||||
pc1 += vl;
|
||||
pc2 += vl;
|
||||
pc3 += vl;
|
||||
pc4 += vl;
|
||||
pc5 += vl;
|
||||
pc6 += vl;
|
||||
pc7 += vl;
|
||||
}
|
||||
}
|
||||
pc += ldc * (n/8) * 8;
|
||||
|
||||
if (n & 4)
|
||||
{
|
||||
pb0 = pc;
|
||||
pb1 = pb0 + ldc;
|
||||
pb2 = pb1 + ldc;
|
||||
pb3 = pb2 + ldc;
|
||||
|
||||
bb0 = (*pb0) * aa;
|
||||
bb1 = (*pb1) * aa;
|
||||
bb2 = (*pb2) * aa;
|
||||
bb3 = (*pb3) * aa;
|
||||
|
||||
*b = bb0;
|
||||
*(b+1) = bb1;
|
||||
*(b+2) = bb2;
|
||||
*(b+3) = bb3;
|
||||
|
||||
*pb0 = bb0;
|
||||
*pb1 = bb1;
|
||||
*pb2 = bb2;
|
||||
*pb3 = bb3;
|
||||
b += 4;
|
||||
|
||||
pa = a + i + 1;
|
||||
pc0 = pb0 + 1;
|
||||
pc1 = pc0 + ldc;
|
||||
pc2 = pc1 + ldc;
|
||||
pc3 = pc2 + ldc;
|
||||
|
||||
for (k = (m - i - 1); k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
vc0 = VLEV_FLOAT(pc0, vl);
|
||||
vc1 = VLEV_FLOAT(pc1, vl);
|
||||
vc2 = VLEV_FLOAT(pc2, vl);
|
||||
vc3 = VLEV_FLOAT(pc3, vl);
|
||||
|
||||
va = VLEV_FLOAT(pa, vl);
|
||||
|
||||
vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
|
||||
vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
|
||||
vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl);
|
||||
vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl);
|
||||
|
||||
VSEV_FLOAT(pc0, vc0, vl);
|
||||
VSEV_FLOAT(pc1, vc1, vl);
|
||||
VSEV_FLOAT(pc2, vc2, vl);
|
||||
VSEV_FLOAT(pc3, vc3, vl);
|
||||
|
||||
pa += vl;
|
||||
pc0 += vl;
|
||||
pc1 += vl;
|
||||
pc2 += vl;
|
||||
pc3 += vl;
|
||||
}
|
||||
pc += ldc * 4;
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
{
|
||||
pb0 = pc;
|
||||
pb1 = pb0 + ldc;
|
||||
bb0 = (*pb0) * aa;
|
||||
bb1 = (*pb1) * aa;
|
||||
*b = bb0;
|
||||
*(b+1) = bb1;
|
||||
*pb0 = bb0;
|
||||
*pb1 = bb1;
|
||||
b += 2;
|
||||
pa = a + i + 1;
|
||||
pc0 = pb0 + 1;
|
||||
pc1 = pc0 + ldc;
|
||||
for (k = (m - i - 1); k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
vc0 = VLEV_FLOAT(pc0, vl);
|
||||
vc1 = VLEV_FLOAT(pc1, vl);
|
||||
va = VLEV_FLOAT(pa, vl);
|
||||
vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
|
||||
vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
|
||||
VSEV_FLOAT(pc0, vc0, vl);
|
||||
VSEV_FLOAT(pc1, vc1, vl);
|
||||
pa += vl;
|
||||
pc0 += vl;
|
||||
pc1 += vl;
|
||||
}
|
||||
pc += ldc * 2;
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
pb0 = pc;
|
||||
bb0 = *(pb0);
|
||||
bb0 *= aa;
|
||||
*b = bb0;
|
||||
*pb0 = bb0;
|
||||
b++;
|
||||
pa = a + i + 1;
|
||||
pc0 = pb0 + 1;
|
||||
for (k = (m - i - 1); k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
vc0 = VLEV_FLOAT(pc0, vl);
|
||||
va = VLEV_FLOAT(pa, vl);
|
||||
vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
|
||||
VSEV_FLOAT(pc0, vc0, vl);
|
||||
pa += vl;
|
||||
pc0 += vl;
|
||||
}
|
||||
}
|
||||
|
||||
a += m;
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
|
||||
|
||||
FLOAT aa, bb;
|
||||
|
||||
int i, j, k;
|
||||
|
||||
for (i = 0; i < m; i++) {
|
||||
|
||||
aa = *(a + i);
|
||||
|
||||
for (j = 0; j < n; j ++) {
|
||||
bb = *(c + i + j * ldc);
|
||||
bb *= aa;
|
||||
*b = bb;
|
||||
*(c + i + j * ldc) = bb;
|
||||
b ++;
|
||||
|
||||
for (k = i + 1; k < m; k ++){
|
||||
*(c + k + j * ldc) -= bb * *(a + k);
|
||||
}
|
||||
|
||||
}
|
||||
a += m;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
|
||||
|
||||
FLOAT aa1, aa2;
|
||||
FLOAT bb1, bb2;
|
||||
FLOAT cc1, cc2;
|
||||
|
||||
int i, j, k;
|
||||
|
||||
ldc *= 2;
|
||||
|
||||
for (i = 0; i < m; i++) {
|
||||
|
||||
aa1 = *(a + i * 2 + 0);
|
||||
aa2 = *(a + i * 2 + 1);
|
||||
|
||||
for (j = 0; j < n; j ++) {
|
||||
bb1 = *(c + i * 2 + 0 + j * ldc);
|
||||
bb2 = *(c + i * 2 + 1 + j * ldc);
|
||||
|
||||
#ifndef CONJ
|
||||
cc1 = aa1 * bb1 - aa2 * bb2;
|
||||
cc2 = aa1 * bb2 + aa2 * bb1;
|
||||
#else
|
||||
cc1 = aa1 * bb1 + aa2 * bb2;
|
||||
cc2 = aa1 * bb2 - aa2 * bb1;
|
||||
#endif
|
||||
|
||||
*(b + 0) = cc1;
|
||||
*(b + 1) = cc2;
|
||||
*(c + i * 2 + 0 + j * ldc) = cc1;
|
||||
*(c + i * 2 + 1 + j * ldc) = cc2;
|
||||
b += 2;
|
||||
|
||||
for (k = i + 1; k < m; k ++){
|
||||
#ifndef CONJ
|
||||
*(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1);
|
||||
*(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0);
|
||||
#else
|
||||
*(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1);
|
||||
*(c + k * 2 + 1 + j * ldc) -= -cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0);
|
||||
#endif
|
||||
}
|
||||
|
||||
}
|
||||
a += m * 2;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static inline void solve_N1(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
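/*
 * Complex forward substitution with a vectorized inner update:
 * VLSEG2_FLOAT loads the interleaved real/imaginary parts of a and c into
 * separate registers, the four fnmsac/fmacc calls implement c -= cc * a
 * (with the sign flips of the conjugated case under CONJ), and
 * VSSEG2_FLOAT stores the pair back in interleaved form.
 */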
|
||||
|
||||
FLOAT aa1, aa2;
|
||||
FLOAT bb1, bb2;
|
||||
FLOAT cc1, cc2;
|
||||
FLOAT *pa, *pc;
|
||||
|
||||
int i, j, k;
|
||||
|
||||
size_t vl;
|
||||
FLOAT_V_T va0, va1, vc0, vc1;
|
||||
|
||||
ldc *= 2;
|
||||
|
||||
for (i = 0; i < m; i++) {
|
||||
|
||||
aa1 = *(a + i * 2 + 0);
|
||||
aa2 = *(a + i * 2 + 1);
|
||||
|
||||
for (j = 0; j < n; j ++) {
|
||||
bb1 = *(c + i * 2 + 0 + j * ldc);
|
||||
bb2 = *(c + i * 2 + 1 + j * ldc);
|
||||
|
||||
#ifndef CONJ
|
||||
cc1 = aa1 * bb1 - aa2 * bb2;
|
||||
cc2 = aa1 * bb2 + aa2 * bb1;
|
||||
#else
|
||||
cc1 = aa1 * bb1 + aa2 * bb2;
|
||||
cc2 = aa1 * bb2 - aa2 * bb1;
|
||||
#endif
|
||||
|
||||
*(b + 0) = cc1;
|
||||
*(b + 1) = cc2;
|
||||
*(c + i * 2 + 0 + j * ldc) = cc1;
|
||||
*(c + i * 2 + 1 + j * ldc) = cc2;
|
||||
b += 2;
|
||||
|
||||
pa = a + (i + 1) * 2;
|
||||
pc = c + j * ldc + (i + 1) * 2;
|
||||
for (k = (m - i - 1); k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
VLSEG2_FLOAT(&va0, &va1, pa, vl);
|
||||
VLSEG2_FLOAT(&vc0, &vc1, pc, vl);
|
||||
#ifndef CONJ
|
||||
vc0 = VFNMSACVF_FLOAT(vc0, cc1, va0, vl);
vc0 = VFMACCVF_FLOAT(vc0, cc2, va1, vl);
vc1 = VFNMSACVF_FLOAT(vc1, cc1, va1, vl);
vc1 = VFNMSACVF_FLOAT(vc1, cc2, va0, vl);
|
||||
#else
|
||||
vc0 = VFNMSACVF_FLOAT(vc0, cc1, va0, vl);
vc0 = VFNMSACVF_FLOAT(vc0, cc2, va1, vl);
vc1 = VFMACCVF_FLOAT(vc1, cc1, va1, vl);
vc1 = VFNMSACVF_FLOAT(vc1, cc2, va0, vl);
|
||||
#endif
|
||||
VSSEG2_FLOAT(pc, vc0, vc1, vl);
|
||||
pa += vl * 2;
|
||||
pc += vl * 2;
|
||||
}
|
||||
}
|
||||
}
|
||||
a += m * 2;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
|
||||
#ifdef COMPLEX
|
||||
FLOAT dummy2,
|
||||
#endif
|
||||
FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){
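/*
 * Driver for the LT kernel: same tiling as the LN driver (n by
 * GEMM_UNROLL_N, m by the runtime vector length vl), but the row blocks
 * are walked top-down and kk starts at offset, growing by the size of each
 * solved block.  Full vl-sized blocks are handled first, then the m % vl
 * remainder.
 */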
|
||||
|
||||
FLOAT *aa, *cc;
|
||||
BLASLONG kk;
|
||||
BLASLONG i, j;
|
||||
|
||||
size_t vl = VSETVL_MAX;
|
||||
|
||||
//fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug
|
||||
|
||||
j = (n >> GEMM_UNROLL_N_SHIFT);
|
||||
|
||||
while (j > 0) {
|
||||
|
||||
kk = offset;
|
||||
aa = a;
|
||||
cc = c;
|
||||
|
||||
i = vl;
|
||||
|
||||
while (i <= m) {
|
||||
|
||||
if (kk > 0) {
|
||||
GEMM_KERNEL(vl, GEMM_UNROLL_N, kk, dm1,
|
||||
#ifdef COMPLEX
|
||||
ZERO,
|
||||
#endif
|
||||
aa, b, cc, ldc);
|
||||
}
|
||||
|
||||
solve(vl, GEMM_UNROLL_N,
|
||||
aa + kk * vl * COMPSIZE,
|
||||
b + kk * GEMM_UNROLL_N * COMPSIZE,
|
||||
cc, ldc);
|
||||
|
||||
aa += vl * k * COMPSIZE;
|
||||
cc += vl * COMPSIZE;
|
||||
kk += vl;
|
||||
i += vl;
|
||||
}
|
||||
|
||||
i = m % vl;
|
||||
if (i) {
|
||||
if (kk > 0) {
|
||||
GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1,
|
||||
#ifdef COMPLEX
|
||||
ZERO,
|
||||
#endif
|
||||
aa, b, cc, ldc);
|
||||
}
|
||||
solve(i, GEMM_UNROLL_N,
|
||||
aa + kk * i * COMPSIZE,
|
||||
b + kk * GEMM_UNROLL_N * COMPSIZE,
|
||||
cc, ldc);
|
||||
|
||||
aa += i * k * COMPSIZE;
|
||||
cc += i * COMPSIZE;
|
||||
kk += i;
|
||||
|
||||
}
|
||||
|
||||
b += GEMM_UNROLL_N * k * COMPSIZE;
|
||||
c += GEMM_UNROLL_N * ldc * COMPSIZE;
|
||||
j --;
|
||||
}
|
||||
|
||||
if (n & (GEMM_UNROLL_N - 1)) {
|
||||
|
||||
j = (GEMM_UNROLL_N >> 1);
|
||||
while (j > 0) {
|
||||
if (n & j) {
|
||||
|
||||
kk = offset;
|
||||
aa = a;
|
||||
cc = c;
|
||||
|
||||
i = vl;
|
||||
|
||||
while (i <= m) {
|
||||
if (kk > 0) {
|
||||
GEMM_KERNEL(vl, j, kk, dm1,
|
||||
#ifdef COMPLEX
|
||||
ZERO,
|
||||
#endif
|
||||
aa,
|
||||
b,
|
||||
cc,
|
||||
ldc);
|
||||
}
|
||||
|
||||
solve(vl, j,
|
||||
aa + kk * vl * COMPSIZE,
|
||||
b + kk * j * COMPSIZE, cc, ldc);
|
||||
|
||||
aa += vl * k * COMPSIZE;
|
||||
cc += vl * COMPSIZE;
|
||||
kk += vl;
|
||||
i += vl;
|
||||
}
|
||||
|
||||
i = m % vl;
|
||||
if (i) {
|
||||
if (kk > 0) {
|
||||
GEMM_KERNEL(i, j, kk, dm1,
|
||||
#ifdef COMPLEX
|
||||
ZERO,
|
||||
#endif
|
||||
aa,
|
||||
b,
|
||||
cc,
|
||||
ldc);
|
||||
}
|
||||
|
||||
solve(i, j,
|
||||
aa + kk * i * COMPSIZE,
|
||||
b + kk * j * COMPSIZE, cc, ldc);
|
||||
|
||||
aa += i * k * COMPSIZE;
|
||||
cc += i * COMPSIZE;
|
||||
kk += i;
|
||||
|
||||
}
|
||||
|
||||
b += j * k * COMPSIZE;
|
||||
c += j * ldc * COMPSIZE;
|
||||
}
|
||||
j >>= 1;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,792 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m2(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m2()
|
||||
#define FLOAT_V_T vfloat32m2_t
|
||||
#define VLEV_FLOAT vle32_v_f32m2
|
||||
#define VLSEV_FLOAT vlse32_v_f32m2
|
||||
#define VLSEG2_FLOAT vlseg2e32_v_f32m2
|
||||
#define VSEV_FLOAT vse32_v_f32m2
|
||||
#define VSSEV_FLOAT vsse32_v_f32m2
|
||||
#define VSSEG2_FLOAT vsseg2e32_v_f32m2
|
||||
#define VFMACCVF_FLOAT vfmacc_vf_f32m2
|
||||
#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m2(n)
|
||||
#define VSETVL_MAX vsetvlmax_e64m2()
|
||||
#define FLOAT_V_T vfloat64m2_t
|
||||
#define VLEV_FLOAT vle64_v_f64m2
|
||||
#define VLSEV_FLOAT vlse64_v_f64m2
|
||||
#define VLSEG2_FLOAT vlseg2e64_v_f64m2
|
||||
#define VSEV_FLOAT vse64_v_f64m2
|
||||
#define VSSEV_FLOAT vsse64_v_f64m2
|
||||
#define VSSEG2_FLOAT vsseg2e64_v_f64m2
|
||||
#define VFMACCVF_FLOAT vfmacc_vf_f64m2
|
||||
#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2
|
||||
#endif
|
||||
|
||||
|
||||
static FLOAT dm1 = -1.;
|
||||
|
||||
#ifdef CONJ
|
||||
#define GEMM_KERNEL GEMM_KERNEL_R
|
||||
#else
|
||||
#define GEMM_KERNEL GEMM_KERNEL_N
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 1
|
||||
#define GEMM_UNROLL_N_SHIFT 0
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 2
|
||||
#define GEMM_UNROLL_N_SHIFT 1
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 4
|
||||
#define GEMM_UNROLL_N_SHIFT 2
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 8
|
||||
#define GEMM_UNROLL_N_SHIFT 3
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 16
|
||||
#define GEMM_UNROLL_N_SHIFT 4
|
||||
#endif
|
||||
|
||||
// Optimizes the implementation in ../arm64/trsm_kernel_RN_sve.c
|
||||
|
||||
#ifndef COMPLEX
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 1
|
||||
|
||||
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
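/*
 * RN case: the triangular factor is applied from the right, so the outer
 * loop runs over the n columns of b.  Column i of c is scaled by the
 * packed diagonal entry b[i] and the remaining columns are updated;
 * consecutive elements of one row of c are ldc apart, hence the strided
 * vector loads/stores (VLSEV_FLOAT / VSSEV_FLOAT with stride_ldc).
 */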
|
||||
|
||||
FLOAT aa, bb;
|
||||
FLOAT *pb, *pc;
|
||||
BLASLONG stride_ldc = sizeof(FLOAT) * ldc;
|
||||
int i, j, k;
|
||||
size_t vl;
|
||||
FLOAT_V_T vb, vc;
|
||||
|
||||
for (i = 0; i < n; i++)
|
||||
{
|
||||
bb = *(b + i);
|
||||
|
||||
for (j = 0; j < m; j ++)
|
||||
{
|
||||
aa = *(c + j + i * ldc);
|
||||
aa *= bb;
|
||||
*a = aa;
|
||||
*(c + j + i * ldc) = aa;
|
||||
a ++;
|
||||
|
||||
pb = b + i + 1;
|
||||
pc = c + j + (i + 1) *ldc;
|
||||
for (k = (n - i - 1); k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
vc = VLSEV_FLOAT(pc, stride_ldc, vl);
|
||||
vb = VLEV_FLOAT(pb, vl);
|
||||
vc = VFNMSACVF_FLOAT(vc, aa, vb, vl);
|
||||
VSSEV_FLOAT(pc, stride_ldc, vc, vl);
|
||||
pb += vl;
|
||||
pc += vl * ldc; /* next vl columns of this row (elements are ldc apart) */
|
||||
}
|
||||
}
|
||||
b += n;
|
||||
}
|
||||
}
|
||||
|
||||
#elif GEMM_DEFAULT_UNROLL_N == 2
|
||||
|
||||
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
|
||||
|
||||
FLOAT aa0, aa1, bb;
|
||||
FLOAT *pb, *pc;
|
||||
FLOAT *pa0, *pa1, *pc0, *pc1;
|
||||
BLASLONG stride_ldc = sizeof(FLOAT) * ldc;
|
||||
int i, j, k;
|
||||
size_t vl;
|
||||
FLOAT_V_T vb, vc0, vc1;
|
||||
|
||||
for (i = 0; i < n; i++)
|
||||
{
|
||||
bb = *(b + i);
|
||||
pc = c + i * ldc;
|
||||
for (j = 0; j < m/2; j ++)
|
||||
{
|
||||
pa0 = pc + j * 2;
|
||||
pa1 = pc + j * 2 + 1;
|
||||
aa0 = *pa0 * bb;
|
||||
aa1 = *pa1 * bb;
|
||||
|
||||
*pa0 = aa0;
|
||||
*pa1 = aa1;
|
||||
*a = aa0;
|
||||
*(a + 1)= aa1;
|
||||
a += 2;
|
||||
|
||||
pb = b + i + 1;
|
||||
pc0 = pa0 + ldc;
|
||||
pc1 = pa1 + ldc;
|
||||
for (k = (n - i - 1); k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
|
||||
vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
|
||||
vb = VLEV_FLOAT(pb, vl);
|
||||
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
|
||||
vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
|
||||
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
|
||||
VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
|
||||
pb += vl;
|
||||
pc0 += vl * ldc;
pc1 += vl * ldc;
|
||||
}
|
||||
}
|
||||
pc += (m/2)*2;
|
||||
if (m & 1)
|
||||
{
|
||||
pa0 = pc;
|
||||
aa0 = *pa0 * bb;
|
||||
|
||||
*pa0 = aa0;
|
||||
*a = aa0;
|
||||
a += 1;
|
||||
|
||||
pb = b + i + 1;
|
||||
pc0 = pa0 + ldc;
|
||||
for (k = (n - i - 1); k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
|
||||
vb = VLEV_FLOAT(pb, vl);
|
||||
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
|
||||
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
|
||||
pb += vl;
|
||||
pc0 += vl * ldc;
|
||||
}
|
||||
}
|
||||
b += n;
|
||||
}
|
||||
}
|
||||
|
||||
#elif GEMM_DEFAULT_UNROLL_N == 4
|
||||
|
||||
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
|
||||
|
||||
FLOAT bb;
|
||||
FLOAT aa0, aa1, aa2, aa3;
|
||||
FLOAT *pb, *pc;
|
||||
FLOAT *pa0, *pa1, *pa2, *pa3;
|
||||
FLOAT *pc0, *pc1, *pc2, *pc3;
|
||||
BLASLONG stride_ldc = sizeof(FLOAT) * ldc;
|
||||
int i, j, k;
|
||||
size_t vl;
|
||||
FLOAT_V_T vb, vc0, vc1, vc2, vc3;
|
||||
|
||||
for (i = 0; i < n; i++)
|
||||
{
|
||||
bb = *(b + i);
|
||||
pc = c + i * ldc;
|
||||
for (j = 0; j < m/4; j ++)
|
||||
{
|
||||
pa0 = pc + j * 4;
|
||||
pa1 = pa0 + 1;
|
||||
pa2 = pa1 + 1;
|
||||
pa3 = pa2 + 1;
|
||||
|
||||
aa0 = *pa0 * bb;
|
||||
aa1 = *pa1 * bb;
|
||||
aa2 = *pa2 * bb;
|
||||
aa3 = *pa3 * bb;
|
||||
|
||||
*pa0 = aa0;
|
||||
*pa1 = aa1;
|
||||
*pa2 = aa2;
|
||||
*pa3 = aa3;
|
||||
|
||||
*a = aa0;
|
||||
*(a + 1)= aa1;
|
||||
*(a + 2)= aa2;
|
||||
*(a + 3)= aa3;
|
||||
|
||||
a += 4;
|
||||
|
||||
pb = b + i + 1;
|
||||
pc0 = pa0 + ldc;
|
||||
pc1 = pa1 + ldc;
|
||||
pc2 = pa2 + ldc;
|
||||
pc3 = pa3 + ldc;
|
||||
for (k = (n - i - 1); k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
|
||||
vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
|
||||
vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl);
|
||||
vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl);
|
||||
vb = VLEV_FLOAT(pb, vl);
|
||||
|
||||
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
|
||||
vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
|
||||
vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl);
|
||||
vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl);
|
||||
|
||||
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
|
||||
VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
|
||||
VSSEV_FLOAT(pc2, stride_ldc, vc2, vl);
|
||||
VSSEV_FLOAT(pc3, stride_ldc, vc3, vl);
|
||||
|
||||
pb += vl;
|
||||
pc0 += vl * ldc;
pc1 += vl * ldc;
pc2 += vl * ldc;
pc3 += vl * ldc;
|
||||
}
|
||||
}
|
||||
pc += (m/4)*4;
|
||||
|
||||
if (m & 2)
|
||||
{
|
||||
pa0 = pc;
|
||||
pa1 = pa0 + 1;
|
||||
|
||||
aa0 = *pa0 * bb;
|
||||
aa1 = *pa1 * bb;
|
||||
|
||||
*pa0 = aa0;
|
||||
*pa1 = aa1;
|
||||
|
||||
*a = aa0;
|
||||
*(a + 1)= aa1;
|
||||
|
||||
a += 2;
|
||||
|
||||
pb = b + i + 1;
|
||||
pc0 = pa0 + ldc;
|
||||
pc1 = pa1 + ldc;
|
||||
for (k = (n - i - 1); k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
|
||||
vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
|
||||
vb = VLEV_FLOAT(pb, vl);
|
||||
|
||||
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
|
||||
vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
|
||||
|
||||
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
|
||||
VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
|
||||
|
||||
pb += vl;
|
||||
pc0 += vl * ldc;
pc1 += vl * ldc;
|
||||
}
|
||||
pc += 2;
|
||||
}
|
||||
|
||||
if (m & 1)
|
||||
{
|
||||
pa0 = pc;
|
||||
aa0 = *pa0 * bb;
|
||||
|
||||
*pa0 = aa0;
|
||||
*a = aa0;
|
||||
a += 1;
|
||||
|
||||
pb = b + i + 1;
|
||||
pc0 = pa0 + ldc;
|
||||
for (k = (n - i - 1); k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
|
||||
vb = VLEV_FLOAT(pb, vl);
|
||||
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
|
||||
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
|
||||
pb += vl;
|
||||
pc0 += vl * ldc;
|
||||
}
|
||||
}
|
||||
b += n;
|
||||
}
|
||||
}
|
||||
|
||||
#elif GEMM_DEFAULT_UNROLL_N == 8
|
||||
|
||||
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
|
||||
|
||||
FLOAT bb;
|
||||
FLOAT aa0, aa1, aa2, aa3, aa4, aa5, aa6, aa7;
|
||||
FLOAT *pb, *pc;
|
||||
FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;
|
||||
FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7;
|
||||
BLASLONG stride_ldc = sizeof(FLOAT) * ldc;
|
||||
int i, j, k;
|
||||
size_t vl;
|
||||
FLOAT_V_T vb, vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7;
|
||||
|
||||
for (i = 0; i < n; i++)
|
||||
{
|
||||
bb = *(b + i);
|
||||
pc = c + i * ldc;
|
||||
for (j = 0; j < m/8; j ++)
|
||||
{
|
||||
pa0 = pc + j * 8;
|
||||
pa1 = pa0 + 1;
|
||||
pa2 = pa1 + 1;
|
||||
pa3 = pa2 + 1;
|
||||
pa4 = pa3 + 1;
|
||||
pa5 = pa4 + 1;
|
||||
pa6 = pa5 + 1;
|
||||
pa7 = pa6 + 1;
|
||||
|
||||
aa0 = *pa0 * bb;
|
||||
aa1 = *pa1 * bb;
|
||||
aa2 = *pa2 * bb;
|
||||
aa3 = *pa3 * bb;
|
||||
aa4 = *pa4 * bb;
|
||||
aa5 = *pa5 * bb;
|
||||
aa6 = *pa6 * bb;
|
||||
aa7 = *pa7 * bb;
|
||||
|
||||
*pa0 = aa0;
|
||||
*pa1 = aa1;
|
||||
*pa2 = aa2;
|
||||
*pa3 = aa3;
|
||||
*pa4 = aa4;
|
||||
*pa5 = aa5;
|
||||
*pa6 = aa6;
|
||||
*pa7 = aa7;
|
||||
|
||||
*a = aa0;
|
||||
*(a + 1)= aa1;
|
||||
*(a + 2)= aa2;
|
||||
*(a + 3)= aa3;
|
||||
*(a + 4)= aa4;
|
||||
*(a + 5)= aa5;
|
||||
*(a + 6)= aa6;
|
||||
*(a + 7)= aa7;
|
||||
|
||||
a += 8;
|
||||
|
||||
pb = b + i + 1;
|
||||
pc0 = pa0 + ldc;
|
||||
pc1 = pa1 + ldc;
|
||||
pc2 = pa2 + ldc;
|
||||
pc3 = pa3 + ldc;
|
||||
pc4 = pa4 + ldc;
|
||||
pc5 = pa5 + ldc;
|
||||
pc6 = pa6 + ldc;
|
||||
pc7 = pa7 + ldc;
|
||||
for (k = (n - i - 1); k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
|
||||
vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
|
||||
vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl);
|
||||
vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl);
|
||||
vc4 = VLSEV_FLOAT(pc4, stride_ldc, vl);
|
||||
vc5 = VLSEV_FLOAT(pc5, stride_ldc, vl);
|
||||
vc6 = VLSEV_FLOAT(pc6, stride_ldc, vl);
|
||||
vc7 = VLSEV_FLOAT(pc7, stride_ldc, vl);
|
||||
vb = VLEV_FLOAT(pb, vl);
|
||||
|
||||
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
|
||||
vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
|
||||
vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl);
|
||||
vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl);
|
||||
vc4 = VFNMSACVF_FLOAT(vc4, aa4, vb, vl);
|
||||
vc5 = VFNMSACVF_FLOAT(vc5, aa5, vb, vl);
|
||||
vc6 = VFNMSACVF_FLOAT(vc6, aa6, vb, vl);
|
||||
vc7 = VFNMSACVF_FLOAT(vc7, aa7, vb, vl);
|
||||
|
||||
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
|
||||
VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
|
||||
VSSEV_FLOAT(pc2, stride_ldc, vc2, vl);
|
||||
VSSEV_FLOAT(pc3, stride_ldc, vc3, vl);
|
||||
VSSEV_FLOAT(pc4, stride_ldc, vc4, vl);
|
||||
VSSEV_FLOAT(pc5, stride_ldc, vc5, vl);
|
||||
VSSEV_FLOAT(pc6, stride_ldc, vc6, vl);
|
||||
VSSEV_FLOAT(pc7, stride_ldc, vc7, vl);
|
||||
|
||||
pb += vl;
|
||||
pc0 += vl * ldc;
pc1 += vl * ldc;
pc2 += vl * ldc;
pc3 += vl * ldc;
pc4 += vl * ldc;
pc5 += vl * ldc;
pc6 += vl * ldc;
pc7 += vl * ldc;
|
||||
}
|
||||
}
|
||||
pc += (m/8)*8;
|
||||
|
||||
if (m & 4)
|
||||
{
|
||||
pa0 = pc;
|
||||
pa1 = pa0 + 1;
|
||||
pa2 = pa1 + 1;
|
||||
pa3 = pa2 + 1;
|
||||
|
||||
aa0 = *pa0 * bb;
|
||||
aa1 = *pa1 * bb;
|
||||
aa2 = *pa2 * bb;
|
||||
aa3 = *pa3 * bb;
|
||||
|
||||
*pa0 = aa0;
|
||||
*pa1 = aa1;
|
||||
*pa2 = aa2;
|
||||
*pa3 = aa3;
|
||||
|
||||
*a = aa0;
|
||||
*(a + 1)= aa1;
|
||||
*(a + 2)= aa2;
|
||||
*(a + 3)= aa3;
|
||||
|
||||
a += 4;
|
||||
|
||||
pb = b + i + 1;
|
||||
pc0 = pa0 + ldc;
|
||||
pc1 = pa1 + ldc;
|
||||
pc2 = pa2 + ldc;
|
||||
pc3 = pa3 + ldc;
|
||||
for (k = (n - i - 1); k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
|
||||
vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
|
||||
vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl);
|
||||
vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl);
|
||||
vb = VLEV_FLOAT(pb, vl);
|
||||
|
||||
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
|
||||
vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
|
||||
vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl);
|
||||
vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl);
|
||||
|
||||
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
|
||||
VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
|
||||
VSSEV_FLOAT(pc2, stride_ldc, vc2, vl);
|
||||
VSSEV_FLOAT(pc3, stride_ldc, vc3, vl);
|
||||
|
||||
pb += vl;
|
||||
pc0 += vl * ldc;
pc1 += vl * ldc;
pc2 += vl * ldc;
pc3 += vl * ldc;
|
||||
}
|
||||
pc += 4;
|
||||
}
|
||||
|
||||
if (m & 2)
|
||||
{
|
||||
pa0 = pc;
|
||||
pa1 = pa0 + 1;
|
||||
|
||||
aa0 = *pa0 * bb;
|
||||
aa1 = *pa1 * bb;
|
||||
|
||||
*pa0 = aa0;
|
||||
*pa1 = aa1;
|
||||
|
||||
*a = aa0;
|
||||
*(a + 1)= aa1;
|
||||
|
||||
a += 2;
|
||||
|
||||
pb = b + i + 1;
|
||||
pc0 = pa0 + ldc;
|
||||
pc1 = pa1 + ldc;
|
||||
for (k = (n - i - 1); k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
|
||||
vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
|
||||
vb = VLEV_FLOAT(pb, vl);
|
||||
|
||||
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
|
||||
vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
|
||||
|
||||
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
|
||||
VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
|
||||
|
||||
pb += vl;
|
||||
pc0 += vl * ldc;
pc1 += vl * ldc;
|
||||
}
|
||||
pc += 2;
|
||||
}
|
||||
|
||||
if (m & 1)
|
||||
{
|
||||
pa0 = pc;
|
||||
aa0 = *pa0 * bb;
|
||||
|
||||
*pa0 = aa0;
|
||||
*a = aa0;
|
||||
a += 1;
|
||||
|
||||
pb = b + i + 1;
|
||||
pc0 = pa0 + ldc;
|
||||
for (k = (n - i - 1); k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
|
||||
vb = VLEV_FLOAT(pb, vl);
|
||||
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
|
||||
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
|
||||
pb += vl;
|
||||
pc0 += vl * ldc;
|
||||
}
|
||||
}
|
||||
b += n;
|
||||
}
|
||||
}
|
||||
#else
|
||||
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
|
||||
|
||||
FLOAT aa, bb;
|
||||
|
||||
int i, j, k;
|
||||
|
||||
for (i = 0; i < n; i++) {
|
||||
|
||||
bb = *(b + i);
|
||||
|
||||
for (j = 0; j < m; j ++) {
|
||||
aa = *(c + j + i * ldc);
|
||||
aa *= bb;
|
||||
*a = aa;
|
||||
*(c + j + i * ldc) = aa;
|
||||
a ++;
|
||||
|
||||
for (k = i + 1; k < n; k ++){
|
||||
*(c + j + k * ldc) -= aa * *(b + k);
|
||||
}
|
||||
|
||||
}
|
||||
b += n;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
|
||||
|
||||
FLOAT aa1, aa2;
|
||||
FLOAT bb1, bb2;
|
||||
FLOAT cc1, cc2;
|
||||
|
||||
int i, j, k;
|
||||
|
||||
ldc *= 2;
|
||||
|
||||
for (i = 0; i < n; i++) {
|
||||
|
||||
bb1 = *(b + i * 2 + 0);
|
||||
bb2 = *(b + i * 2 + 1);
|
||||
|
||||
for (j = 0; j < m; j ++) {
|
||||
aa1 = *(c + j * 2 + 0 + i * ldc);
|
||||
aa2 = *(c + j * 2 + 1 + i * ldc);
|
||||
|
||||
#ifndef CONJ
|
||||
cc1 = aa1 * bb1 - aa2 * bb2;
|
||||
cc2 = aa1 * bb2 + aa2 * bb1;
|
||||
#else
|
||||
cc1 = aa1 * bb1 + aa2 * bb2;
|
||||
cc2 = -aa1 * bb2 + aa2 * bb1;
|
||||
#endif
|
||||
|
||||
*(a + 0) = cc1;
|
||||
*(a + 1) = cc2;
|
||||
*(c + j * 2 + 0 + i * ldc) = cc1;
|
||||
*(c + j * 2 + 1 + i * ldc) = cc2;
|
||||
a += 2;
|
||||
|
||||
for (k = i + 1; k < n; k ++){
|
||||
#ifndef CONJ
|
||||
*(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1);
|
||||
*(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0);
|
||||
#else
|
||||
*(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1);
|
||||
*(c + j * 2 + 1 + k * ldc) -= - cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0);
|
||||
#endif
|
||||
}
|
||||
|
||||
}
|
||||
b += n * 2;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
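// Driver: walk the column blocks of width GEMM_UNROLL_N; inside each block, process
// row strips of width vl (the maximum vector length for this data type). For every
// strip, GEMM_KERNEL first subtracts the contribution of the kk columns solved so far,
// then solve() performs the triangular substitution on the current block; kk advances
// by GEMM_UNROLL_N per block, and the sub-GEMM_UNROLL_N tail of n is handled afterwards.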
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
|
||||
#ifdef COMPLEX
|
||||
FLOAT dummy2,
|
||||
#endif
|
||||
FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){
|
||||
|
||||
FLOAT *aa, *cc;
|
||||
BLASLONG kk;
|
||||
BLASLONG i, j;
|
||||
|
||||
size_t vl = VSETVL_MAX;
|
||||
|
||||
//fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug
|
||||
|
||||
|
||||
j = (n >> GEMM_UNROLL_N_SHIFT);
|
||||
kk = -offset;
|
||||
|
||||
while (j > 0) {
|
||||
|
||||
aa = a;
|
||||
cc = c;
|
||||
|
||||
i = vl;
|
||||
|
||||
if (i <= m) {
|
||||
do {
|
||||
if (kk > 0) {
|
||||
GEMM_KERNEL(vl, GEMM_UNROLL_N, kk, dm1,
|
||||
#ifdef COMPLEX
|
||||
ZERO,
|
||||
#endif
|
||||
aa, b, cc, ldc);
|
||||
}
|
||||
|
||||
solve(vl, GEMM_UNROLL_N,
|
||||
aa + kk * vl * COMPSIZE,
|
||||
b + kk * GEMM_UNROLL_N * COMPSIZE,
|
||||
cc, ldc);
|
||||
|
||||
aa += vl * k * COMPSIZE;
|
||||
cc += vl * COMPSIZE;
|
||||
i += vl;
|
||||
} while (i <= m);
|
||||
}
|
||||
|
||||
|
||||
i = m % vl;
|
||||
if (i) {
|
||||
if (kk > 0) {
|
||||
GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1,
|
||||
#ifdef COMPLEX
|
||||
ZERO,
|
||||
#endif
|
||||
aa, b, cc, ldc);
|
||||
}
|
||||
solve(i, GEMM_UNROLL_N,
|
||||
aa + kk * i * COMPSIZE,
|
||||
b + kk * GEMM_UNROLL_N * COMPSIZE,
|
||||
cc, ldc);
|
||||
|
||||
aa += i * k * COMPSIZE;
|
||||
cc += i * COMPSIZE;
|
||||
|
||||
}
|
||||
|
||||
kk += GEMM_UNROLL_N;
|
||||
b += GEMM_UNROLL_N * k * COMPSIZE;
|
||||
c += GEMM_UNROLL_N * ldc * COMPSIZE;
|
||||
j --;
|
||||
}
|
||||
|
||||
if (n & (GEMM_UNROLL_N - 1)) {
|
||||
|
||||
j = (GEMM_UNROLL_N >> 1);
|
||||
while (j > 0) {
|
||||
if (n & j) {
|
||||
|
||||
aa = a;
|
||||
cc = c;
|
||||
|
||||
i = vl;
|
||||
|
||||
while (i <= m) {
|
||||
if (kk > 0) {
|
||||
GEMM_KERNEL(vl, j, kk, dm1,
|
||||
#ifdef COMPLEX
|
||||
ZERO,
|
||||
#endif
|
||||
aa,
|
||||
b,
|
||||
cc,
|
||||
ldc);
|
||||
}
|
||||
|
||||
solve(vl, j,
|
||||
aa + kk * vl * COMPSIZE,
|
||||
b + kk * j * COMPSIZE, cc, ldc);
|
||||
|
||||
aa += vl * k * COMPSIZE;
|
||||
cc += vl * COMPSIZE;
|
||||
i += vl;
|
||||
}
|
||||
|
||||
i = m % vl;
|
||||
if (i) {
|
||||
if (kk > 0) {
|
||||
GEMM_KERNEL(i, j, kk, dm1,
|
||||
#ifdef COMPLEX
|
||||
ZERO,
|
||||
#endif
|
||||
aa,
|
||||
b,
|
||||
cc,
|
||||
ldc);
|
||||
}
|
||||
|
||||
solve(i, j,
|
||||
aa + kk * i * COMPSIZE,
|
||||
b + kk * j * COMPSIZE, cc, ldc);
|
||||
|
||||
aa += i * k * COMPSIZE;
|
||||
cc += i * COMPSIZE;
|
||||
|
||||
}
|
||||
|
||||
b += j * k * COMPSIZE;
|
||||
c += j * ldc * COMPSIZE;
|
||||
kk += j;
|
||||
}
|
||||
j >>= 1;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,828 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m2(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m2()
|
||||
#define FLOAT_V_T vfloat32m2_t
|
||||
#define VLEV_FLOAT vle32_v_f32m2
|
||||
#define VLSEV_FLOAT vlse32_v_f32m2
|
||||
#define VLSEG2_FLOAT vlseg2e32_v_f32m2
|
||||
#define VSEV_FLOAT vse32_v_f32m2
|
||||
#define VSSEV_FLOAT vsse32_v_f32m2
|
||||
#define VSSEG2_FLOAT vsseg2e32_v_f32m2
|
||||
#define VFMACCVF_FLOAT vfmacc_vf_f32m2
|
||||
#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m2(n)
|
||||
#define VSETVL_MAX vsetvlmax_e64m2()
|
||||
#define FLOAT_V_T vfloat64m2_t
|
||||
#define VLEV_FLOAT vle64_v_f64m2
|
||||
#define VLSEV_FLOAT vlse64_v_f64m2
|
||||
#define VLSEG2_FLOAT vlseg2e64_v_f64m2
|
||||
#define VSEV_FLOAT vse64_v_f64m2
|
||||
#define VSSEV_FLOAT vsse64_v_f64m2
|
||||
#define VSSEG2_FLOAT vsseg2e64_v_f64m2
|
||||
#define VFMACCVF_FLOAT vfmacc_vf_f64m2
|
||||
#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2
|
||||
#endif
|
||||
|
||||
|
||||
static FLOAT dm1 = -1.;
|
||||
|
||||
#ifdef CONJ
|
||||
#define GEMM_KERNEL GEMM_KERNEL_R
|
||||
#else
|
||||
#define GEMM_KERNEL GEMM_KERNEL_N
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 1
|
||||
#define GEMM_UNROLL_N_SHIFT 0
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 2
|
||||
#define GEMM_UNROLL_N_SHIFT 1
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 4
|
||||
#define GEMM_UNROLL_N_SHIFT 2
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 8
|
||||
#define GEMM_UNROLL_N_SHIFT 3
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 16
|
||||
#define GEMM_UNROLL_N_SHIFT 4
|
||||
#endif
|
||||
|
||||
// Optimizes the implementation in ../arm64/trsm_kernel_RT_sve.c
|
||||
|
||||
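// The solve() routines below implement the backward substitution for this kernel:
// columns of C are visited from i = n-1 down to 0, every row entry of column i is
// scaled by bb (the diagonal factor of the packed B, stored as its reciprocal by the
// trsm copy routines), and written back to both C and the packed A buffer. The
// still-unsolved columns k < i are then updated; the real-valued specializations
// vectorize this update with strided loads/stores (VLSEV/VSSEV) across ldc, one
// variant per GEMM_UNROLL_N value, while the complex case falls back to the scalar loop.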
#ifndef COMPLEX
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 1
|
||||
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
|
||||
|
||||
FLOAT aa, bb;
|
||||
FLOAT *pb, *pc;
|
||||
BLASLONG stride_ldc = sizeof(FLOAT) * ldc;
|
||||
|
||||
int i, j, k;
|
||||
size_t vl;
|
||||
FLOAT_V_T vb, vc;
|
||||
|
||||
a += (n - 1) * m;
|
||||
b += (n - 1) * n;
|
||||
|
||||
for (i = n - 1; i >= 0; i--) {
|
||||
|
||||
bb = *(b + i);
|
||||
|
||||
for (j = 0; j < m; j ++) {
|
||||
aa = *(c + j + i * ldc);
|
||||
aa *= bb;
|
||||
*a = aa;
|
||||
*(c + j + i * ldc) = aa;
|
||||
a ++;
|
||||
|
||||
pb = b;
|
||||
pc = c + j;
|
||||
for (k = i; k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
vc = VLSEV_FLOAT(pc, stride_ldc, vl);
|
||||
vb = VLEV_FLOAT(pb, vl);
|
||||
vc = VFNMSACVF_FLOAT(vc, aa, vb, vl);
|
||||
VSSEV_FLOAT(pc, stride_ldc, vc, vl);
|
||||
pb += vl;
|
||||
pc++;
|
||||
}
|
||||
}
|
||||
b -= n;
|
||||
a -= 2 * m;
|
||||
}
|
||||
|
||||
}
|
||||
#elif GEMM_DEFAULT_UNROLL_N == 2
|
||||
|
||||
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
|
||||
|
||||
FLOAT aa0, aa1, bb;
|
||||
FLOAT *pb, *pc;
|
||||
FLOAT *pa0, *pa1, *pc0, *pc1;
|
||||
BLASLONG stride_ldc = sizeof(FLOAT) * ldc;
|
||||
int i, j, k;
|
||||
size_t vl;
|
||||
FLOAT_V_T vb, vc0, vc1;
|
||||
|
||||
a += (n - 1) * m;
|
||||
b += (n - 1) * n;
|
||||
|
||||
for (i = n - 1; i >= 0; i--)
|
||||
{
|
||||
bb = *(b + i);
|
||||
pc = c + i * ldc;
|
||||
for (j = 0; j < m/2; j ++)
|
||||
{
|
||||
pa0 = pc + j * 2;
|
||||
pa1 = pc + j * 2 + 1;
|
||||
aa0 = *pa0 * bb;
|
||||
aa1 = *pa1 * bb;
|
||||
|
||||
*pa0 = aa0;
|
||||
*pa1 = aa1;
|
||||
*a = aa0;
|
||||
*(a + 1)= aa1;
|
||||
a += 2;
|
||||
|
||||
pb = b;
|
||||
pc0 = c + j * 2;
|
||||
pc1 = pc0 + 1;
|
||||
for (k = i; k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
|
||||
vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
|
||||
vb = VLEV_FLOAT(pb, vl);
|
||||
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
|
||||
vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
|
||||
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
|
||||
VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
|
||||
pb += vl;
|
||||
pc0++;
|
||||
pc1++;
|
||||
}
|
||||
}
|
||||
pc += (m/2)*2;
|
||||
|
||||
if (m & 1)
|
||||
{
|
||||
pa0 = pc;
|
||||
aa0 = *pa0 * bb;
|
||||
|
||||
*pa0 = aa0;
|
||||
*a = aa0;
|
||||
a += 1;
|
||||
|
||||
pb = b;
|
||||
pc0 = pc - i * ldc;
|
||||
for (k = i; k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
|
||||
vb = VLEV_FLOAT(pb, vl);
|
||||
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
|
||||
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
|
||||
pb += vl;
|
||||
pc0++;
|
||||
}
|
||||
}
|
||||
b -= n;
|
||||
a -= 2 * m;
|
||||
}
|
||||
}
|
||||
|
||||
#elif GEMM_DEFAULT_UNROLL_N == 4
|
||||
|
||||
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
|
||||
|
||||
FLOAT aa0, aa1, aa2, aa3;
|
||||
FLOAT bb;
|
||||
FLOAT *pb, *pc;
|
||||
FLOAT *pa0, *pa1, *pa2, *pa3;
|
||||
FLOAT *pc0, *pc1, *pc2, *pc3;
|
||||
BLASLONG stride_ldc = sizeof(FLOAT) * ldc;
|
||||
int i, j, k;
|
||||
size_t vl;
|
||||
FLOAT_V_T vb, vc0, vc1, vc2, vc3;
|
||||
|
||||
a += (n - 1) * m;
|
||||
b += (n - 1) * n;
|
||||
|
||||
for (i = n - 1; i >= 0; i--)
|
||||
{
|
||||
bb = *(b + i);
|
||||
pc = c + i * ldc;
|
||||
for (j = 0; j < m/4; j ++)
|
||||
{
|
||||
pa0 = pc + j * 4;
|
||||
pa1 = pa0 + 1;
|
||||
pa2 = pa1 + 1;
|
||||
pa3 = pa2 + 1;
|
||||
|
||||
aa0 = *pa0 * bb;
|
||||
aa1 = *pa1 * bb;
|
||||
aa2 = *pa2 * bb;
|
||||
aa3 = *pa3 * bb;
|
||||
|
||||
*pa0 = aa0;
|
||||
*pa1 = aa1;
|
||||
*pa2 = aa2;
|
||||
*pa3 = aa3;
|
||||
|
||||
*a = aa0;
|
||||
*(a + 1)= aa1;
|
||||
*(a + 2)= aa2;
|
||||
*(a + 3)= aa3;
|
||||
a += 4;
|
||||
|
||||
pb = b;
|
||||
pc0 = c + j * 4;
|
||||
pc1 = pc0 + 1;
|
||||
pc2 = pc1 + 1;
|
||||
pc3 = pc2 + 1;
|
||||
for (k = i; k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
|
||||
vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
|
||||
vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl);
|
||||
vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl);
|
||||
vb = VLEV_FLOAT(pb, vl);
|
||||
|
||||
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
|
||||
vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
|
||||
vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl);
|
||||
vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl);
|
||||
|
||||
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
|
||||
VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
|
||||
VSSEV_FLOAT(pc2, stride_ldc, vc2, vl);
|
||||
VSSEV_FLOAT(pc3, stride_ldc, vc3, vl);
|
||||
|
||||
pb += vl;
|
||||
pc0++;
|
||||
pc1++;
|
||||
pc2++;
|
||||
pc3++;
|
||||
}
|
||||
}
|
||||
pc += (m/4)*4;
|
||||
|
||||
if (m & 2)
|
||||
{
|
||||
pa0 = pc;    /* first of the two remaining rows of column i; pc already points past the 4-row blocks */
|
||||
pa1 = pa0 + 1;
|
||||
|
||||
aa0 = *pa0 * bb;
|
||||
aa1 = *pa1 * bb;
|
||||
|
||||
*pa0 = aa0;
|
||||
*pa1 = aa1;
|
||||
|
||||
*a = aa0;
|
||||
*(a + 1)= aa1;
|
||||
a += 2;
|
||||
|
||||
pb = b;
|
||||
pc0 = c + j * 4;
|
||||
pc1 = pc0 + 1;
|
||||
for (k = i; k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
|
||||
vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
|
||||
vb = VLEV_FLOAT(pb, vl);
|
||||
|
||||
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
|
||||
vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
|
||||
|
||||
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
|
||||
VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
|
||||
|
||||
pb += vl;
|
||||
pc0++;
|
||||
pc1++;
|
||||
}
|
||||
pc += 2;
|
||||
}
|
||||
|
||||
if (m & 1)
|
||||
{
|
||||
pa0 = pc;
|
||||
aa0 = *pa0 * bb;
|
||||
|
||||
*pa0 = aa0;
|
||||
*a = aa0;
|
||||
a += 1;
|
||||
|
||||
pb = b;
|
||||
pc0 = pc - i * ldc;
|
||||
for (k = i; k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
|
||||
vb = VLEV_FLOAT(pb, vl);
|
||||
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
|
||||
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
|
||||
pb += vl;
|
||||
pc0++;
|
||||
}
|
||||
}
|
||||
b -= n;
|
||||
a -= 2 * m;
|
||||
}
|
||||
}
|
||||
#elif GEMM_DEFAULT_UNROLL_N == 8
|
||||
|
||||
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
|
||||
|
||||
FLOAT aa0, aa1, aa2, aa3, aa4, aa5, aa6, aa7;
|
||||
FLOAT bb;
|
||||
FLOAT *pb, *pc;
|
||||
FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;
|
||||
FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7;
|
||||
BLASLONG stride_ldc = sizeof(FLOAT) * ldc;
|
||||
int i, j, k;
|
||||
size_t vl;
|
||||
FLOAT_V_T vb, vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7;
|
||||
|
||||
a += (n - 1) * m;
|
||||
b += (n - 1) * n;
|
||||
|
||||
for (i = n - 1; i >= 0; i--)
|
||||
{
|
||||
bb = *(b + i);
|
||||
pc = c + i * ldc;
|
||||
for (j = 0; j < m/8; j ++)
|
||||
{
|
||||
pa0 = pc + j * 8;
|
||||
pa1 = pa0 + 1;
|
||||
pa2 = pa1 + 1;
|
||||
pa3 = pa2 + 1;
|
||||
pa4 = pa3 + 1;
|
||||
pa5 = pa4 + 1;
|
||||
pa6 = pa5 + 1;
|
||||
pa7 = pa6 + 1;
|
||||
|
||||
aa0 = *pa0 * bb;
|
||||
aa1 = *pa1 * bb;
|
||||
aa2 = *pa2 * bb;
|
||||
aa3 = *pa3 * bb;
|
||||
aa4 = *pa4 * bb;
|
||||
aa5 = *pa5 * bb;
|
||||
aa6 = *pa6 * bb;
|
||||
aa7 = *pa7 * bb;
|
||||
|
||||
*pa0 = aa0;
|
||||
*pa1 = aa1;
|
||||
*pa2 = aa2;
|
||||
*pa3 = aa3;
|
||||
*pa4 = aa4;
|
||||
*pa5 = aa5;
|
||||
*pa6 = aa6;
|
||||
*pa7 = aa7;
|
||||
|
||||
*a = aa0;
|
||||
*(a + 1)= aa1;
|
||||
*(a + 2)= aa2;
|
||||
*(a + 3)= aa3;
|
||||
*(a + 4)= aa4;
|
||||
*(a + 5)= aa5;
|
||||
*(a + 6)= aa6;
|
||||
*(a + 7)= aa7;
|
||||
a += 8;
|
||||
|
||||
pb = b;
|
||||
pc0 = c + j * 8;
|
||||
pc1 = pc0 + 1;
|
||||
pc2 = pc1 + 1;
|
||||
pc3 = pc2 + 1;
|
||||
pc4 = pc3 + 1;
|
||||
pc5 = pc4 + 1;
|
||||
pc6 = pc5 + 1;
|
||||
pc7 = pc6 + 1;
|
||||
for (k = i; k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
|
||||
vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
|
||||
vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl);
|
||||
vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl);
|
||||
vc4 = VLSEV_FLOAT(pc4, stride_ldc, vl);
|
||||
vc5 = VLSEV_FLOAT(pc5, stride_ldc, vl);
|
||||
vc6 = VLSEV_FLOAT(pc6, stride_ldc, vl);
|
||||
vc7 = VLSEV_FLOAT(pc7, stride_ldc, vl);
|
||||
vb = VLEV_FLOAT(pb, vl);
|
||||
|
||||
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
|
||||
vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
|
||||
vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl);
|
||||
vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl);
|
||||
vc4 = VFNMSACVF_FLOAT(vc4, aa4, vb, vl);
|
||||
vc5 = VFNMSACVF_FLOAT(vc5, aa5, vb, vl);
|
||||
vc6 = VFNMSACVF_FLOAT(vc6, aa6, vb, vl);
|
||||
vc7 = VFNMSACVF_FLOAT(vc7, aa7, vb, vl);
|
||||
|
||||
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
|
||||
VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
|
||||
VSSEV_FLOAT(pc2, stride_ldc, vc2, vl);
|
||||
VSSEV_FLOAT(pc3, stride_ldc, vc3, vl);
|
||||
VSSEV_FLOAT(pc4, stride_ldc, vc4, vl);
|
||||
VSSEV_FLOAT(pc5, stride_ldc, vc5, vl);
|
||||
VSSEV_FLOAT(pc6, stride_ldc, vc6, vl);
|
||||
VSSEV_FLOAT(pc7, stride_ldc, vc7, vl);
|
||||
|
||||
pb += vl;
|
||||
pc0++;
|
||||
pc1++;
|
||||
pc2++;
|
||||
pc3++;
|
||||
pc4++;
|
||||
pc5++;
|
||||
pc6++;
|
||||
pc7++;
|
||||
}
|
||||
}
|
||||
pc += (m/8)*8;
|
||||
|
||||
if (m & 4)
|
||||
{
|
||||
pa0 = pc;
|
||||
pa1 = pa0 + 1;
|
||||
pa2 = pa1 + 1;
|
||||
pa3 = pa2 + 1;
|
||||
|
||||
aa0 = *pa0 * bb;
|
||||
aa1 = *pa1 * bb;
|
||||
aa2 = *pa2 * bb;
|
||||
aa3 = *pa3 * bb;
|
||||
|
||||
*pa0 = aa0;
|
||||
*pa1 = aa1;
|
||||
*pa2 = aa2;
|
||||
*pa3 = aa3;
|
||||
|
||||
*a = aa0;
|
||||
*(a + 1)= aa1;
|
||||
*(a + 2)= aa2;
|
||||
*(a + 3)= aa3;
|
||||
a += 4;
|
||||
|
||||
pb = b;
|
||||
pc0 = pc - i * ldc;
|
||||
pc1 = pc0 + 1;
|
||||
pc2 = pc1 + 1;
|
||||
pc3 = pc2 + 1;
|
||||
for (k = i; k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
|
||||
vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
|
||||
vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl);
|
||||
vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl);
|
||||
vb = VLEV_FLOAT(pb, vl);
|
||||
|
||||
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
|
||||
vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
|
||||
vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl);
|
||||
vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl);
|
||||
|
||||
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
|
||||
VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
|
||||
VSSEV_FLOAT(pc2, stride_ldc, vc2, vl);
|
||||
VSSEV_FLOAT(pc3, stride_ldc, vc3, vl);
|
||||
|
||||
pb += vl;
|
||||
pc0++;
|
||||
pc1++;
|
||||
pc2++;
|
||||
pc3++;
|
||||
}
|
||||
pc += 4;
|
||||
}
|
||||
|
||||
if (m & 2)
|
||||
{
|
||||
pa0 = pc;
|
||||
pa1 = pa0 + 1;
|
||||
|
||||
aa0 = *pa0 * bb;
|
||||
aa1 = *pa1 * bb;
|
||||
|
||||
*pa0 = aa0;
|
||||
*pa1 = aa1;
|
||||
|
||||
*a = aa0;
|
||||
*(a + 1)= aa1;
|
||||
a += 2;
|
||||
|
||||
pb = b;
|
||||
pc0 = pc - i * ldc;
|
||||
pc1 = pc0 + 1;
|
||||
for (k = i; k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
|
||||
vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
|
||||
vb = VLEV_FLOAT(pb, vl);
|
||||
|
||||
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
|
||||
vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
|
||||
|
||||
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
|
||||
VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
|
||||
|
||||
pb += vl;
|
||||
pc0++;
|
||||
pc1++;
|
||||
}
|
||||
pc += 2;
|
||||
}
|
||||
|
||||
if (m & 1)
|
||||
{
|
||||
pa0 = pc;
|
||||
aa0 = *pa0 * bb;
|
||||
|
||||
*pa0 = aa0;
|
||||
*a = aa0;
|
||||
a += 1;
|
||||
|
||||
pb = b;
|
||||
pc0 = pc - i * ldc;
|
||||
for (k = i; k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
|
||||
vb = VLEV_FLOAT(pb, vl);
|
||||
vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
|
||||
VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
|
||||
pb += vl;
|
||||
pc0++;
|
||||
}
|
||||
}
|
||||
b -= n;
|
||||
a -= 2 * m;
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
|
||||
|
||||
FLOAT aa, bb;
|
||||
|
||||
int i, j, k;
|
||||
|
||||
a += (n - 1) * m;
|
||||
b += (n - 1) * n;
|
||||
|
||||
for (i = n - 1; i >= 0; i--) {
|
||||
|
||||
bb = *(b + i);
|
||||
|
||||
for (j = 0; j < m; j ++) {
|
||||
aa = *(c + j + i * ldc);
|
||||
aa *= bb;
|
||||
*a = aa;
|
||||
*(c + j + i * ldc) = aa;
|
||||
a ++;
|
||||
|
||||
for (k = 0; k < i; k ++){
|
||||
*(c + j + k * ldc) -= aa * *(b + k);
|
||||
}
|
||||
|
||||
}
|
||||
b -= n;
|
||||
a -= 2 * m;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
|
||||
|
||||
FLOAT aa1, aa2;
|
||||
FLOAT bb1, bb2;
|
||||
FLOAT cc1, cc2;
|
||||
|
||||
int i, j, k;
|
||||
|
||||
ldc *= 2;
|
||||
|
||||
a += (n - 1) * m * 2;
|
||||
b += (n - 1) * n * 2;
|
||||
|
||||
for (i = n - 1; i >= 0; i--) {
|
||||
|
||||
bb1 = *(b + i * 2 + 0);
|
||||
bb2 = *(b + i * 2 + 1);
|
||||
|
||||
for (j = 0; j < m; j ++) {
|
||||
|
||||
aa1 = *(c + j * 2 + 0 + i * ldc);
|
||||
aa2 = *(c + j * 2 + 1 + i * ldc);
|
||||
|
||||
#ifndef CONJ
|
||||
cc1 = aa1 * bb1 - aa2 * bb2;
|
||||
cc2 = aa1 * bb2 + aa2 * bb1;
|
||||
#else
|
||||
cc1 = aa1 * bb1 + aa2 * bb2;
|
||||
cc2 = - aa1 * bb2 + aa2 * bb1;
|
||||
#endif
|
||||
|
||||
*(a + 0) = cc1;
|
||||
*(a + 1) = cc2;
|
||||
|
||||
*(c + j * 2 + 0 + i * ldc) = cc1;
|
||||
*(c + j * 2 + 1 + i * ldc) = cc2;
|
||||
a += 2;
|
||||
|
||||
for (k = 0; k < i; k ++){
|
||||
#ifndef CONJ
|
||||
*(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1);
|
||||
*(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0);
|
||||
#else
|
||||
*(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1);
|
||||
*(c + j * 2 + 1 + k * ldc) -= -cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0);
|
||||
#endif
|
||||
}
|
||||
|
||||
}
|
||||
b -= n * 2;
|
||||
a -= 4 * m;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
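// RT driver: b and c start past the last column block and are stepped back one block
// at a time (tail blocks first, then full GEMM_UNROLL_N blocks). kk starts at n - offset,
// so GEMM_KERNEL applies the trailing k - kk update columns before solve() handles the
// current block, and kk shrinks as blocks are completed.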
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
|
||||
#ifdef COMPLEX
|
||||
FLOAT dummy2,
|
||||
#endif
|
||||
FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){
|
||||
|
||||
BLASLONG i, j;
|
||||
FLOAT *aa, *cc;
|
||||
BLASLONG kk;
|
||||
|
||||
size_t vl = VSETVL_MAX;
|
||||
|
||||
//fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug
|
||||
|
||||
kk = n - offset;
|
||||
c += n * ldc * COMPSIZE;
|
||||
b += n * k * COMPSIZE;
|
||||
|
||||
if (n & (GEMM_UNROLL_N - 1)) {
|
||||
|
||||
j = 1;
|
||||
while (j < GEMM_UNROLL_N) {
|
||||
if (n & j) {
|
||||
|
||||
aa = a;
|
||||
b -= j * k * COMPSIZE;
|
||||
c -= j * ldc* COMPSIZE;
|
||||
cc = c;
|
||||
|
||||
i = vl;
|
||||
if (i <= m) {
|
||||
|
||||
do {
|
||||
if (k - kk > 0) {
|
||||
GEMM_KERNEL(vl, j, k - kk, dm1,
|
||||
#ifdef COMPLEX
|
||||
ZERO,
|
||||
#endif
|
||||
aa + vl * kk * COMPSIZE,
|
||||
b + j * kk * COMPSIZE,
|
||||
cc,
|
||||
ldc);
|
||||
}
|
||||
|
||||
solve(vl, j,
|
||||
aa + (kk - j) * vl * COMPSIZE,
|
||||
b + (kk - j) * j * COMPSIZE,
|
||||
cc, ldc);
|
||||
|
||||
aa += vl * k * COMPSIZE;
|
||||
cc += vl * COMPSIZE;
|
||||
i += vl;
|
||||
} while (i <= m);
|
||||
}
|
||||
|
||||
i = m % vl;
|
||||
if (i) {
|
||||
if (k - kk > 0) {
|
||||
GEMM_KERNEL(i, j, k - kk, dm1,
|
||||
#ifdef COMPLEX
|
||||
ZERO,
|
||||
#endif
|
||||
aa + i * kk * COMPSIZE,
|
||||
b + j * kk * COMPSIZE,
|
||||
cc, ldc);
|
||||
}
|
||||
|
||||
solve(i, j,
|
||||
aa + (kk - j) * i * COMPSIZE,
|
||||
b + (kk - j) * j * COMPSIZE,
|
||||
cc, ldc);
|
||||
|
||||
aa += i * k * COMPSIZE;
|
||||
cc += i * COMPSIZE;
|
||||
|
||||
}
|
||||
kk -= j;
|
||||
}
|
||||
j <<= 1;
|
||||
}
|
||||
}
|
||||
|
||||
j = (n >> GEMM_UNROLL_N_SHIFT);
|
||||
|
||||
if (j > 0) {
|
||||
|
||||
do {
|
||||
aa = a;
|
||||
b -= GEMM_UNROLL_N * k * COMPSIZE;
|
||||
c -= GEMM_UNROLL_N * ldc * COMPSIZE;
|
||||
cc = c;
|
||||
|
||||
i = vl;
|
||||
if (i <= m) {
|
||||
do {
|
||||
if (k - kk > 0) {
|
||||
GEMM_KERNEL(vl, GEMM_UNROLL_N, k - kk, dm1,
|
||||
#ifdef COMPLEX
|
||||
ZERO,
|
||||
#endif
|
||||
aa + vl * kk * COMPSIZE,
|
||||
b + GEMM_UNROLL_N * kk * COMPSIZE,
|
||||
cc,
|
||||
ldc);
|
||||
}
|
||||
|
||||
solve(vl, GEMM_UNROLL_N,
|
||||
aa + (kk - GEMM_UNROLL_N) * vl * COMPSIZE,
|
||||
b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE,
|
||||
cc, ldc);
|
||||
|
||||
aa += vl * k * COMPSIZE;
|
||||
cc += vl * COMPSIZE;
|
||||
i += vl;
|
||||
} while (i <= m);
|
||||
}
|
||||
|
||||
i = m % vl;
|
||||
if (i) {
|
||||
if (k - kk > 0) {
|
||||
GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1,
|
||||
#ifdef COMPLEX
|
||||
ZERO,
|
||||
#endif
|
||||
aa + i * kk * COMPSIZE,
|
||||
b + GEMM_UNROLL_N * kk * COMPSIZE,
|
||||
cc,
|
||||
ldc);
|
||||
}
|
||||
|
||||
solve(i, GEMM_UNROLL_N,
|
||||
aa + (kk - GEMM_UNROLL_N) * i * COMPSIZE,
|
||||
b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE,
|
||||
cc, ldc);
|
||||
|
||||
aa += i * k * COMPSIZE;
|
||||
cc += i * COMPSIZE;
|
||||
|
||||
}
|
||||
|
||||
kk -= GEMM_UNROLL_N;
|
||||
j --;
|
||||
} while (j > 0);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,122 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m2(n)
|
||||
#define FLOAT_V_T vfloat32m2_t
|
||||
#define VLEV_FLOAT vle32_v_f32m2
|
||||
#define VSEV_FLOAT vse32_v_f32m2
|
||||
#define VSEV_FLOAT_M vse32_v_f32m2_m
|
||||
#define VLSEV_FLOAT vlse32_v_f32m2
|
||||
#define VBOOL_T vbool16_t
|
||||
#define UINT_V_T vuint32m2_t
|
||||
#define VID_V_UINT vid_v_u32m2
|
||||
#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m2(n)
|
||||
#define FLOAT_V_T vfloat64m2_t
|
||||
#define VLEV_FLOAT vle64_v_f64m2
|
||||
#define VSEV_FLOAT vse64_v_f64m2
|
||||
#define VSEV_FLOAT_M vse64_v_f64m2_m
|
||||
#define VLSEV_FLOAT vlse64_v_f64m2
|
||||
#define VBOOL_T vbool32_t
|
||||
#define UINT_V_T vuint64m2_t
|
||||
#define VID_V_UINT vid_v_u64m2
|
||||
#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32
|
||||
|
||||
#endif
|
||||
|
||||
#ifndef UNIT
|
||||
#define INV(a) (ONE / (a))
|
||||
#else
|
||||
#define INV(a) (ONE)
|
||||
#endif
|
||||
|
||||
// Optimizes the implementation in ../arm64/trsm_lncopy_sve.c
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
|
||||
|
||||
BLASLONG i, ii, jj, js;
|
||||
|
||||
FLOAT *ao;
|
||||
|
||||
jj = offset;
|
||||
|
||||
BLASLONG stride_lda = sizeof(FLOAT)*lda;
|
||||
|
||||
FLOAT_V_T va1;
|
||||
VBOOL_T vbool_cmp;
|
||||
UINT_V_T vindex;
|
||||
size_t vl;
|
||||
|
||||
for (js = n; js > 0; js -= vl)
|
||||
{
|
||||
vl = VSETVL(js);
|
||||
ao = a;
|
||||
|
||||
ii = 0;
|
||||
for (i = 0; i < m;)
|
||||
{
|
||||
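// Diagonal block: for each of the vl rows, gather the row with a stride of lda,
// keep only the entries left of the diagonal via the mask (vindex < j), and store
// the diagonal element itself as INV(.) so the solve kernel can multiply by it.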
if (ii == jj)
|
||||
{
|
||||
vindex = VID_V_UINT(vl);
|
||||
for (unsigned int j = 0; j < vl; j++)
|
||||
{
|
||||
va1 = VLSEV_FLOAT(ao, stride_lda, vl);
|
||||
vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl);
|
||||
VSEV_FLOAT_M(vbool_cmp, b, va1, vl);
|
||||
|
||||
*(b + j) = INV(*(ao + j * lda));
|
||||
ao++;
|
||||
b += vl;
|
||||
}
|
||||
i += vl;
|
||||
ii += vl;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (ii > jj)
|
||||
{
|
||||
va1 = VLSEV_FLOAT(ao, stride_lda, vl);
|
||||
VSEV_FLOAT(b, va1, vl);
|
||||
}
|
||||
ao++;
|
||||
b += vl;
|
||||
i++;
|
||||
ii++;
|
||||
}
|
||||
}
|
||||
|
||||
a += vl * lda;
|
||||
jj += vl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,122 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m2(n)
|
||||
#define FLOAT_V_T vfloat32m2_t
|
||||
#define VLEV_FLOAT vle32_v_f32m2
|
||||
#define VSEV_FLOAT vse32_v_f32m2
|
||||
#define VSEV_FLOAT_M vse32_v_f32m2_m
|
||||
#define VLSEV_FLOAT vlse32_v_f32m2
|
||||
#define VBOOL_T vbool16_t
|
||||
#define UINT_V_T vuint32m2_t
|
||||
#define VID_V_UINT vid_v_u32m2
|
||||
#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m2(n)
|
||||
#define FLOAT_V_T vfloat64m2_t
|
||||
#define VLEV_FLOAT vle64_v_f64m2
|
||||
#define VSEV_FLOAT vse64_v_f64m2
|
||||
#define VSEV_FLOAT_M vse64_v_f64m2_m
|
||||
#define VLSEV_FLOAT vlse64_v_f64m2
|
||||
#define VBOOL_T vbool32_t
|
||||
#define UINT_V_T vuint64m2_t
|
||||
#define VID_V_UINT vid_v_u64m2
|
||||
#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32
|
||||
#endif
|
||||
|
||||
#ifndef UNIT
|
||||
#define INV(a) (ONE / (a))
|
||||
#else
|
||||
#define INV(a) (ONE)
|
||||
#endif
|
||||
|
||||
// Optimizes the implementation in ../arm64/trsm_ltcopy_sve.c
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
|
||||
|
||||
BLASLONG i, ii, jj, js;
|
||||
|
||||
FLOAT *ao;
|
||||
|
||||
jj = offset;
|
||||
|
||||
FLOAT_V_T va1;
|
||||
VBOOL_T vbool_cmp;
|
||||
UINT_V_T vindex;
|
||||
|
||||
size_t vl;
|
||||
|
||||
for (js = n; js > 0; js -= vl)
|
||||
{
|
||||
vl = VSETVL(js);
|
||||
ao = a;
|
||||
|
||||
ii = 0;
|
||||
for (i = 0; i < m;)
|
||||
{
|
||||
|
||||
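// Diagonal block: each slice of vl contiguous elements is loaded with a unit-stride
// load, the mask (vindex > j) keeps only the part past the diagonal position j,
// and element j is stored as INV(.).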
if (ii == jj)
|
||||
{
|
||||
vindex = VID_V_UINT(vl);
|
||||
for (unsigned int j = 0; j < vl; j++)
|
||||
{
|
||||
*(b + j) = INV(*(ao + j));
|
||||
|
||||
va1 = VLEV_FLOAT(ao, vl);
|
||||
vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl);
|
||||
VSEV_FLOAT_M(vbool_cmp, b, va1, vl);
|
||||
|
||||
b += vl;
|
||||
ao += lda;
|
||||
}
|
||||
i += vl;
|
||||
ii += vl;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (ii < jj)
|
||||
{
|
||||
va1 = VLEV_FLOAT(ao, vl);
|
||||
VSEV_FLOAT(b, va1, vl);
|
||||
}
|
||||
ao += lda;
|
||||
b += vl;
|
||||
i ++;
|
||||
ii ++;
|
||||
}
|
||||
}
|
||||
|
||||
a += vl;
|
||||
jj += vl;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -0,0 +1,121 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m2(n)
|
||||
#define FLOAT_V_T vfloat32m2_t
|
||||
#define VLEV_FLOAT vle32_v_f32m2
|
||||
#define VSEV_FLOAT vse32_v_f32m2
|
||||
#define VSEV_FLOAT_M vse32_v_f32m2_m
|
||||
#define VLSEV_FLOAT vlse32_v_f32m2
|
||||
#define VBOOL_T vbool16_t
|
||||
#define UINT_V_T vuint32m2_t
|
||||
#define VID_V_UINT vid_v_u32m2
|
||||
#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m2(n)
|
||||
#define FLOAT_V_T vfloat64m2_t
|
||||
#define VLEV_FLOAT vle64_v_f64m2
|
||||
#define VSEV_FLOAT vse64_v_f64m2
|
||||
#define VSEV_FLOAT_M vse64_v_f64m2_m
|
||||
#define VLSEV_FLOAT vlse64_v_f64m2
|
||||
#define VBOOL_T vbool32_t
|
||||
#define UINT_V_T vuint64m2_t
|
||||
#define VID_V_UINT vid_v_u64m2
|
||||
#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef UNIT
|
||||
#define INV(a) (ONE / (a))
|
||||
#else
|
||||
#define INV(a) (ONE)
|
||||
#endif
|
||||
|
||||
// Optimizes the implementation in ../arm64/trsm_uncopy_sve.c
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
|
||||
|
||||
BLASLONG i, ii, jj, js;
|
||||
BLASLONG stride_lda = sizeof(FLOAT)*lda;
|
||||
|
||||
FLOAT *ao;
|
||||
jj = offset;
|
||||
|
||||
FLOAT_V_T va1;
|
||||
VBOOL_T vbool_cmp;
|
||||
UINT_V_T vindex;
|
||||
|
||||
size_t vl;
|
||||
|
||||
for (js = n; js > 0; js -= vl)
|
||||
{
|
||||
vl = VSETVL(js);
|
||||
ao = a;
|
||||
|
||||
i = 0;
|
||||
ii = 0;
|
||||
for (i = 0; i < m;)
|
||||
{
|
||||
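// Diagonal block: slices are gathered with a stride of lda, the mask (vindex > j)
// keeps only the entries right of the diagonal, and *(ao + j * lda) is stored as INV(.).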
if (ii == jj)
|
||||
{
|
||||
vindex = VID_V_UINT(vl);
|
||||
for (unsigned int j = 0; j < vl; j++)
|
||||
{
|
||||
*(b + j) = INV(*(ao + j * lda));
|
||||
va1 = VLSEV_FLOAT(ao, stride_lda, vl);
|
||||
vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl);
|
||||
VSEV_FLOAT_M(vbool_cmp, b, va1, vl);
|
||||
ao++;
|
||||
b += vl;
|
||||
}
|
||||
i += vl;
|
||||
ii += vl;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (ii < jj)
|
||||
{
|
||||
va1 = VLSEV_FLOAT(ao, stride_lda, vl);
|
||||
VSEV_FLOAT(b, va1, vl);
|
||||
}
|
||||
ao++;
|
||||
b += vl;
|
||||
i++;
|
||||
ii++;
|
||||
}
|
||||
}
|
||||
|
||||
a += vl * lda;
|
||||
jj += vl;
|
||||
}
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,123 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m2(n)
|
||||
#define FLOAT_V_T vfloat32m2_t
|
||||
#define VLEV_FLOAT vle32_v_f32m2
|
||||
#define VSEV_FLOAT vse32_v_f32m2
|
||||
#define VSEV_FLOAT_M vse32_v_f32m2_m
|
||||
#define VLSEV_FLOAT vlse32_v_f32m2
|
||||
#define VBOOL_T vbool16_t
|
||||
#define UINT_V_T vuint32m2_t
|
||||
#define VID_V_UINT vid_v_u32m2
|
||||
#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m2(n)
|
||||
#define FLOAT_V_T vfloat64m2_t
|
||||
#define VLEV_FLOAT vle64_v_f64m2
|
||||
#define VSEV_FLOAT vse64_v_f64m2
|
||||
#define VSEV_FLOAT_M vse64_v_f64m2_m
|
||||
#define VLSEV_FLOAT vlse64_v_f64m2
|
||||
#define VBOOL_T vbool32_t
|
||||
#define UINT_V_T vuint64m2_t
|
||||
#define VID_V_UINT vid_v_u64m2
|
||||
#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef UNIT
|
||||
#define INV(a) (ONE / (a))
|
||||
#else
|
||||
#define INV(a) (ONE)
|
||||
#endif
|
||||
|
||||
// Optimizes the implementation in ../arm64/trsm_utcopy_sve.c
|
||||
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
|
||||
|
||||
BLASLONG i, ii, jj, js;
|
||||
|
||||
FLOAT *ao;
|
||||
|
||||
jj = offset;
|
||||
FLOAT_V_T va1;
|
||||
|
||||
VBOOL_T vbool_cmp;
|
||||
UINT_V_T vindex;
|
||||
|
||||
size_t vl;
|
||||
|
||||
for (js = n; js > 0; js -= vl)
|
||||
{
|
||||
vl = VSETVL(js);
|
||||
ao = a;
|
||||
|
||||
ii = 0;
|
||||
for (i = 0; i < m;)
|
||||
{
|
||||
|
||||
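// Diagonal block: slices are loaded contiguously, the mask (vindex < j) keeps only
// the first j entries, and element j is stored as INV(.).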
if (ii == jj)
|
||||
{
|
||||
vindex = VID_V_UINT(vl);
|
||||
for (unsigned int j = 0; j < vl; j++)
|
||||
{
|
||||
va1 = VLEV_FLOAT(ao, vl);
|
||||
vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl);
|
||||
VSEV_FLOAT_M(vbool_cmp, b, va1, vl);
|
||||
*(b + j) = INV(*(ao + j));
|
||||
|
||||
ao += lda;
|
||||
b += vl;
|
||||
}
|
||||
i += vl;
|
||||
ii += vl;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (ii > jj)
|
||||
{
|
||||
va1 = VLEV_FLOAT(ao, vl);
|
||||
VSEV_FLOAT(b, va1, vl);
|
||||
}
|
||||
ao += lda;
|
||||
b += vl;
|
||||
i ++;
|
||||
ii ++;
|
||||
}
|
||||
}
|
||||
|
||||
a += vl;
|
||||
jj += vl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,113 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <float.h>
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m4(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m4()
|
||||
#define VSETVL_MAX_M1 vsetvlmax_e32m1()
|
||||
#define FLOAT_V_T vfloat32m4_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VLSEG_FLOAT vlseg2e32_v_f32m4
|
||||
#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
|
||||
#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f32m4
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
|
||||
#define VFMAXVV_FLOAT vfmax_vv_f32m4
|
||||
#define VFADDVV_FLOAT vfadd_vv_f32m4
|
||||
#define VFABSV_FLOAT vfabs_v_f32m4
|
||||
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m4(n)
|
||||
#define VSETVL_MAX vsetvlmax_e64m4()
|
||||
#define VSETVL_MAX_M1 vsetvlmax_e64m1()
|
||||
#define FLOAT_V_T vfloat64m4_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VLSEG_FLOAT vlseg2e64_v_f64m4
|
||||
#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
|
||||
#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f64m4
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
|
||||
#define VFMAXVV_FLOAT vfmax_vv_f64m4
|
||||
#define VFADDVV_FLOAT vfadd_vv_f64m4
|
||||
#define VFABSV_FLOAT vfabs_v_f64m4
|
||||
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
|
||||
#endif
|
||||
|
||||
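// Returns max(|Re(x[i])| + |Im(x[i])|), the complex "absolute value" used by the BLAS
// amax convention. Segment loads split real and imaginary parts into v0/v1, the
// per-lane sums are folded into vmax with vfmax, and a final vfredmax reduction
// yields the scalar result.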
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
FLOAT maxf=0.0;
|
||||
|
||||
if (n <= 0 || inc_x <= 0) return(maxf);
|
||||
|
||||
FLOAT_V_T v0, v1, vmax;
|
||||
FLOAT_V_T_M1 v_res;
|
||||
|
||||
v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1);
|
||||
size_t vlmax = VSETVL_MAX;
|
||||
vmax = VFMVVF_FLOAT(0.0, vlmax);
|
||||
|
||||
if(inc_x == 1) {
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl*2) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
VLSEG_FLOAT(&v0, &v1, x, vl);
|
||||
|
||||
v0 = VFABSV_FLOAT(v0, vl);
|
||||
v1 = VFABSV_FLOAT(v1, vl);
|
||||
|
||||
v0 = VFADDVV_FLOAT(v0, v1, vl);
|
||||
vmax = VFMAXVV_FLOAT(vmax, v0, vl);
|
||||
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl);
|
||||
|
||||
v0 = VFABSV_FLOAT(v0, vl);
|
||||
v1 = VFABSV_FLOAT(v1, vl);
|
||||
|
||||
v0 = VFADDVV_FLOAT(v0, v1, vl);
|
||||
vmax = VFMAXVV_FLOAT(vmax, v0, vl);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
v_res = VFREDMAXVS_FLOAT(v_res, vmax, v_res, vlmax);
|
||||
maxf = VFMVFS_FLOAT_M1(v_res);
|
||||
|
||||
return(maxf);
|
||||
}
|
|
@ -0,0 +1,112 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <float.h>
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m4(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m4()
|
||||
#define VSETVL_MAX_M1 vsetvlmax_e32m1()
|
||||
#define FLOAT_V_T vfloat32m4_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VLSEG_FLOAT vlseg2e32_v_f32m4
|
||||
#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
|
||||
#define VFREDMINVS_FLOAT vfredmin_vs_f32m4_f32m1
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f32m4
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
|
||||
#define VFMINVV_FLOAT vfmin_vv_f32m4
|
||||
#define VFADDVV_FLOAT vfadd_vv_f32m4
|
||||
#define VFABSV_FLOAT vfabs_v_f32m4
|
||||
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m4(n)
|
||||
#define VSETVL_MAX vsetvlmax_e64m4()
|
||||
#define VSETVL_MAX_M1 vsetvlmax_e64m1()
|
||||
#define FLOAT_V_T vfloat64m4_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VLSEG_FLOAT vlseg2e64_v_f64m4
|
||||
#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
|
||||
#define VFREDMINVS_FLOAT vfredmin_vs_f64m4_f64m1
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f64m4
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
|
||||
#define VFMINVV_FLOAT vfmin_vv_f64m4
|
||||
#define VFADDVV_FLOAT vfadd_vv_f64m4
|
||||
#define VFABSV_FLOAT vfabs_v_f64m4
|
||||
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
|
||||
#endif
|
||||
|
||||
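// Same structure as the zamax kernel, but tracking the minimum of |Re| + |Im|,
// with the accumulators seeded with FLT_MAX.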
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
FLOAT minf=0.0;
|
||||
|
||||
if (n <= 0 || inc_x <= 0) return(minf);
|
||||
|
||||
FLOAT_V_T v0, v1, vmin;
|
||||
FLOAT_V_T_M1 v_res;
|
||||
|
||||
v_res = VFMVVF_FLOAT_M1(FLT_MAX, VSETVL_MAX_M1);
|
||||
size_t vlmax = VSETVL_MAX;
|
||||
vmin = VFMVVF_FLOAT(FLT_MAX, vlmax);
|
||||
|
||||
if(inc_x == 1) {
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl*2) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
VLSEG_FLOAT(&v0, &v1, x, vl);
|
||||
|
||||
v0 = VFABSV_FLOAT(v0, vl);
|
||||
v1 = VFABSV_FLOAT(v1, vl);
|
||||
|
||||
v0 = VFADDVV_FLOAT(v0, v1, vl);
|
||||
vmin = VFMINVV_FLOAT(vmin, v0, vl);
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl);
|
||||
|
||||
v0 = VFABSV_FLOAT(v0, vl);
|
||||
v1 = VFABSV_FLOAT(v1, vl);
|
||||
|
||||
v0 = VFADDVV_FLOAT(v0, v1, vl);
|
||||
vmin = VFMINVV_FLOAT(vmin, v0, vl);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
v_res = VFREDMINVS_FLOAT(v_res, vmin, v_res, vlmax);
|
||||
minf = VFMVFS_FLOAT_M1(v_res);
|
||||
|
||||
return(minf);
|
||||
}
|
|
@ -0,0 +1,108 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m8()
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VLEV_FLOAT vle32_v_f32m8
|
||||
#define VLSEV_FLOAT vlse32_v_f32m8
|
||||
#define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f32m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
|
||||
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
|
||||
#define VFADDVV_FLOAT vfadd_vv_f32m8
|
||||
#define VFABSV_FLOAT vfabs_v_f32m8
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m8(n)
|
||||
#define VSETVL_MAX vsetvlmax_e64m8()
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VLEV_FLOAT vle64_v_f64m8
|
||||
#define VLSEV_FLOAT vlse64_v_f64m8
|
||||
#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f64m8
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
|
||||
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
|
||||
#define VFADDVV_FLOAT vfadd_vv_f64m8
|
||||
#define VFABSV_FLOAT vfabs_v_f64m8
|
||||
#endif
|
||||
|
||||
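// Sums |Re| + |Im| over all n complex elements. The unit-stride path loads two
// contiguous chunks of vl values per step (interleaving is irrelevant to the sum);
// the strided path gathers real and imaginary parts separately with vlse. A single
// vfredusum reduction collapses v_sum at the end.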
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
FLOAT asumf = 0.0;
|
||||
if (n <= 0 || inc_x <= 0) return(asumf);
|
||||
|
||||
FLOAT_V_T v0, v1;
|
||||
size_t vlmax = VSETVL_MAX;
|
||||
FLOAT_V_T v_sum = VFMVVF_FLOAT(0, vlmax);
|
||||
|
||||
if(inc_x == 1) {
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl*2) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
v0 = VLEV_FLOAT(x, vl);
|
||||
v1 = VLEV_FLOAT(x+vl, vl);
|
||||
|
||||
v0 = VFABSV_FLOAT(v0, vl);
|
||||
v1 = VFABSV_FLOAT(v1, vl);
|
||||
|
||||
v_sum = VFADDVV_FLOAT(v_sum, v0, vl);
|
||||
v_sum = VFADDVV_FLOAT(v_sum, v1, vl);
|
||||
}
|
||||
|
||||
}
|
||||
else {
|
||||
|
||||
int stride_x = inc_x * sizeof(FLOAT) * 2;
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
v0 = VLSEV_FLOAT(x, stride_x, vl);
|
||||
v1 = VLSEV_FLOAT(x+1, stride_x, vl);
|
||||
|
||||
v0 = VFABSV_FLOAT(v0, vl);
|
||||
v1 = VFABSV_FLOAT(v1, vl);
|
||||
|
||||
v_sum = VFADDVV_FLOAT(v_sum, v0, vl);
|
||||
v_sum = VFADDVV_FLOAT(v_sum, v1, vl);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
FLOAT_V_T_M1 v_z0 = VFMVVF_FLOAT_M1(0, vlmax);
|
||||
FLOAT_V_T_M1 v_res = VFMVVF_FLOAT_M1(0, vlmax);
|
||||
v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, vlmax);
|
||||
asumf += VFMVFS_FLOAT_M1(v_res);
|
||||
|
||||
return(asumf);
|
||||
}
|
|
@@ -0,0 +1,151 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/***************************************************************************
* 2014/06/07 Saar
*
***************************************************************************/

#include "common.h"

#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define FLOAT_V_T vfloat32m4_t
#define VLSEV_FLOAT vlse32_v_f32m4
#define VSSEV_FLOAT vsse32_v_f32m4
#define VFMACCVF_FLOAT vfmacc_vf_f32m4
#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
#define VFMVVF_FLOAT vfmv_v_f_f32m4
#define VFMULVF_FLOAT vfmul_vf_f32m4
#define VFMSACVF_FLOAT vfmsac_vf_f32m4
#define VLSEG_FLOAT vlseg2e32_v_f32m4
#define VSSEG_FLOAT vsseg2e32_v_f32m4
#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
#define VSSSEG_FLOAT vssseg2e32_v_f32m4
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define FLOAT_V_T vfloat64m4_t
#define VLSEV_FLOAT vlse64_v_f64m4
#define VSSEV_FLOAT vsse64_v_f64m4
#define VFMACCVF_FLOAT vfmacc_vf_f64m4
#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
#define VFMVVF_FLOAT vfmv_v_f_f64m4
#define VFMULVF_FLOAT vfmul_vf_f64m4
#define VFMSACVF_FLOAT vfmsac_vf_f64m4
#define VLSEG_FLOAT vlseg2e64_v_f64m4
#define VSSEG_FLOAT vsseg2e64_v_f64m4
#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
#define VSSSEG_FLOAT vssseg2e64_v_f64m4
#endif

int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FLOAT beta_r, FLOAT beta_i, FLOAT *y, BLASLONG inc_y)
{
    BLASLONG inc_x2, inc_y2;

    if ( n <= 0 ) return(0);

    inc_x2 = 2 * inc_x;
    inc_y2 = 2 * inc_y;

    BLASLONG stride_x = inc_x2 * sizeof(FLOAT);
    BLASLONG stride_y = inc_y2 * sizeof(FLOAT);
    FLOAT_V_T vx0, vx1, vy0, vy1;

    if ( beta_r == 0.0 && beta_i == 0.0)
    {
        if ( alpha_r == 0.0 && alpha_i == 0.0 )
        {
            size_t vl = VSETVL(n);
            FLOAT_V_T temp = VFMVVF_FLOAT(0.0, vl);
            for ( ; n > 0; n -= vl, y += vl*inc_y2)
            {
                vl = VSETVL(n);
                VSSSEG_FLOAT(y, stride_y, temp, temp, vl);
            }
        }
        else
        {
            for (size_t vl; n > 0; n -= vl, x += vl*inc_x2, y += vl*inc_y2)
            {
                vl = VSETVL(n);
                VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);

                vy0 = VFMULVF_FLOAT(vx1, alpha_i, vl);
                vy0 = VFMSACVF_FLOAT(vy0, alpha_r, vx0, vl);

                vy1 = VFMULVF_FLOAT(vx1, alpha_r, vl);
                vy1 = VFMACCVF_FLOAT(vy1, alpha_i, vx0, vl);

                VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl);
            }
        }
    }
    else
    {
        FLOAT_V_T v0, v1;

        if ( alpha_r == 0.0 && alpha_i == 0.0 )
        {
            for (size_t vl; n > 0; n -= vl, y += vl*inc_y2)
            {
                vl = VSETVL(n);
                VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);

                v0 = VFMULVF_FLOAT(vy1, beta_i, vl);
                v0 = VFMSACVF_FLOAT(v0, beta_r, vy0, vl);

                v1 = VFMULVF_FLOAT(vy1, beta_r, vl);
                v1 = VFMACCVF_FLOAT(v1, beta_i, vy0, vl);

                VSSSEG_FLOAT(y, stride_y, v0, v1, vl);
            }
        }
        else
        {
            for (size_t vl; n > 0; n -= vl, x += vl*inc_x2, y += vl*inc_y2)
            {
                vl = VSETVL(n);
                VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
                VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);

                v0 = VFMULVF_FLOAT(vx0, alpha_r, vl);
                v0 = VFNMSACVF_FLOAT(v0, alpha_i, vx1, vl);
                v0 = VFMACCVF_FLOAT(v0, beta_r, vy0, vl);
                v0 = VFNMSACVF_FLOAT(v0, beta_i, vy1, vl);

                v1 = VFMULVF_FLOAT(vx1, alpha_r, vl);
                v1 = VFMACCVF_FLOAT(v1, alpha_i, vx0, vl);
                v1 = VFMACCVF_FLOAT(v1, beta_r, vy1, vl);
                v1 = VFMACCVF_FLOAT(v1, beta_i, vy0, vl);

                VSSSEG_FLOAT(y, stride_y, v0, v1, vl);
            }
        }
    }
    return(0);

}
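For reference, the per-element update the kernel above vectorizes is the complex axpby y := alpha*x + beta*y. A minimal scalar sketch of the general case (unit stride only; the helper name is hypothetical and not part of the patch, FLOAT/BLASLONG are assumed to come from common.h):

/* Hypothetical scalar reference, not part of the patch: y := alpha*x + beta*y
 * with complex alpha = (ar, ai), beta = (br, bi), interleaved re/im storage. */
static void zaxpby_ref(BLASLONG n, FLOAT ar, FLOAT ai, FLOAT *x,
                       FLOAT br, FLOAT bi, FLOAT *y)
{
    for (BLASLONG k = 0; k < n; k++, x += 2, y += 2) {
        FLOAT yr = ar*x[0] - ai*x[1] + br*y[0] - bi*y[1];  /* real part */
        FLOAT yi = ar*x[1] + ai*x[0] + br*y[1] + bi*y[0];  /* imag part */
        y[0] = yr;
        y[1] = yi;
    }
}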
@@ -0,0 +1,154 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define FLOAT_V_T vfloat32m4_t
#define VLSEG_FLOAT vlseg2e32_v_f32m4
#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
#define VSSEG_FLOAT vsseg2e32_v_f32m4
#define VSSSEG_FLOAT vssseg2e32_v_f32m4
#define VFMACCVF_FLOAT vfmacc_vf_f32m4
#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define FLOAT_V_T vfloat64m4_t
#define VLSEG_FLOAT vlseg2e64_v_f64m4
#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
#define VSSEG_FLOAT vsseg2e64_v_f64m4
#define VSSSEG_FLOAT vssseg2e64_v_f64m4
#define VFMACCVF_FLOAT vfmacc_vf_f64m4
#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
#endif

int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
    if(n < 0) return(0);
    if(da_r == 0.0 && da_i == 0.0) return(0);

    FLOAT_V_T vx0, vx1, vy0, vy1;

    if(inc_x == 1 && inc_y == 1) {

        for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) {
            vl = VSETVL(n);

            VLSEG_FLOAT(&vx0, &vx1, x, vl);
            VLSEG_FLOAT(&vy0, &vy1, y, vl);
#if !defined(CONJ)
            vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl);
            vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl);
            vy1 = VFMACCVF_FLOAT(vy1, da_r, vx1, vl);
            vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl);
#else
            vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl);
            vy0 = VFMACCVF_FLOAT(vy0, da_i, vx1, vl);
            vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl);
            vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl);
#endif
            VSSEG_FLOAT(y, vy0, vy1, vl);
        }

    } else if (inc_x == 1) {

        BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);

        for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) {
            vl = VSETVL(n);

            VLSEG_FLOAT(&vx0, &vx1, x, vl);
            VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);

#if !defined(CONJ)
            vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl);
            vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl);
            vy1 = VFMACCVF_FLOAT(vy1, da_r, vx1, vl);
            vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl);
#else
            vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl);
            vy0 = VFMACCVF_FLOAT(vy0, da_i, vx1, vl);
            vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl);
            vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl);
#endif
            VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl);
        }

    } else if (inc_y == 1) {

        BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);

        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) {
            vl = VSETVL(n);

            VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
            VLSEG_FLOAT(&vy0, &vy1, y, vl);

#if !defined(CONJ)
            vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl);
            vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl);
            vy1 = VFMACCVF_FLOAT(vy1, da_r, vx1, vl);
            vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl);
#else
            vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl);
            vy0 = VFMACCVF_FLOAT(vy0, da_i, vx1, vl);
            vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl);
            vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl);
#endif
            VSSEG_FLOAT(y, vy0, vy1, vl);
        }

    } else {

        BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
        BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);

        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) {
            vl = VSETVL(n);

            VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
            VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);

#if !defined(CONJ)
            vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl);
            vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl);
            vy1 = VFMACCVF_FLOAT(vy1, da_r, vx1, vl);
            vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl);
#else
            vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl);
            vy0 = VFMACCVF_FLOAT(vy0, da_i, vx1, vl);
            vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl);
            vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl);
#endif
            VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl);
        }

    }

    return(0);
}
@@ -0,0 +1,105 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

#if !defined(DOUBLE)
#define VSETVL_M8(n) vsetvl_e32m8(n)
#define FLOAT_V_T_M8 vfloat32m8_t
#define VLEV_FLOAT_M8 vle32_v_f32m8
#define VSEV_FLOAT_M8 vse32_v_f32m8

#define VSETVL_M4(n) vsetvl_e32m4(n)
#define FLOAT_V_T_M4 vfloat32m4_t
#define VLSEG_FLOAT_M4 vlseg2e32_v_f32m4
#define VSSEG_FLOAT_M4 vsseg2e32_v_f32m4
#define VLSSEG_FLOAT_M4 vlsseg2e32_v_f32m4
#define VSSSEG_FLOAT_M4 vssseg2e32_v_f32m4
#else
#define VSETVL_M8(n) vsetvl_e64m8(n)
#define FLOAT_V_T_M8 vfloat64m8_t
#define VLEV_FLOAT_M8 vle64_v_f64m8
#define VSEV_FLOAT_M8 vse64_v_f64m8

#define VSETVL_M4(n) vsetvl_e64m4(n)
#define FLOAT_V_T_M4 vfloat64m4_t
#define VLSEG_FLOAT_M4 vlseg2e64_v_f64m4
#define VSSEG_FLOAT_M4 vsseg2e64_v_f64m4
#define VLSSEG_FLOAT_M4 vlsseg2e64_v_f64m4
#define VSSSEG_FLOAT_M4 vssseg2e64_v_f64m4
#endif

int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
    if(n < 0) return(0);

    if(inc_x == 1 && inc_y == 1) {

        FLOAT_V_T_M8 vx;
        n *= 2; // convert to words

        for(size_t vl; n > 0; n -= vl, x += vl, y += vl) {
            vl = VSETVL_M8(n);
            vx = VLEV_FLOAT_M8(x, vl);
            VSEV_FLOAT_M8(y, vx, vl);
        }

    } else if (1 == inc_x) {

        FLOAT_V_T_M4 vr, vi;
        BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);

        for(size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) {
            vl = VSETVL_M4(n);
            VLSEG_FLOAT_M4(&vr, &vi, x, vl);
            VSSSEG_FLOAT_M4(y, stride_y, vr, vi, vl);
        }
    } else if (1 == inc_y) {

        FLOAT_V_T_M4 vr, vi;
        BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);

        for(size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) {
            vl = VSETVL_M4(n);
            VLSSEG_FLOAT_M4(&vr, &vi, x, stride_x, vl);
            VSSEG_FLOAT_M4(y, vr, vi, vl);
        }
    } else {

        FLOAT_V_T_M4 vr, vi;
        BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
        BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);

        for(size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) {
            vl = VSETVL_M4(n);
            VLSSEG_FLOAT_M4(&vr, &vi, x, stride_x, vl);
            VSSSEG_FLOAT_M4(y, stride_y, vr, vi, vl);
        }
    }

    return(0);
}
@@ -0,0 +1,170 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define VSETVL_MAX vsetvlmax_e32m4()
#define VSETVL_MAX_M1 vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m4_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLSEG_FLOAT vlseg2e32_v_f32m4
#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
#define VFMACCVV_FLOAT vfmacc_vv_f32m4
#define VFMVVF_FLOAT vfmv_v_f_f32m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFMULVV_FLOAT vfmul_vv_f32m4
#define VFMSACVV_FLOAT vfmsac_vv_f32m4
#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define VSETVL_MAX vsetvlmax_e64m4()
#define VSETVL_MAX_M1 vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m4_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLSEG_FLOAT vlseg2e64_v_f64m4
#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
#define VFMACCVV_FLOAT vfmacc_vv_f64m4
#define VFMVVF_FLOAT vfmv_v_f_f64m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFMULVV_FLOAT vfmul_vv_f64m4
#define VFMSACVV_FLOAT vfmsac_vv_f64m4
#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
#endif

OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
    OPENBLAS_COMPLEX_FLOAT result;
    CREAL(result) = 0.0;
    CIMAG(result) = 0.0;

    if ( n <= 0 ) return(result);

    FLOAT_V_T vr0, vr1, vx0, vx1, vy0, vy1;
    FLOAT_V_T_M1 v_res, v_z0;
    size_t vlmax_m1 = VSETVL_MAX_M1;
    v_res = VFMVVF_FLOAT_M1(0, vlmax_m1);
    v_z0 = VFMVVF_FLOAT_M1(0, vlmax_m1);

    size_t vlmax = VSETVL_MAX;
    vr0 = VFMVVF_FLOAT(0, vlmax);
    vr1 = VFMVVF_FLOAT(0, vlmax);

    if(inc_x == 1 && inc_y == 1) {

        for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) {
            vl = VSETVL(n);

            VLSEG_FLOAT(&vx0, &vx1, x, vl);
            VLSEG_FLOAT(&vy0, &vy1, y, vl);

            vr0 = VFMACCVV_FLOAT(vr0, vx0, vy0, vl);
            vr1 = VFMACCVV_FLOAT(vr1, vx0, vy1, vl);
#if !defined(CONJ)
            vr0 = VFNMSACVV_FLOAT(vr0, vx1, vy1, vl);
            vr1 = VFMACCVV_FLOAT(vr1, vx1, vy0, vl);
#else
            vr0 = VFMACCVV_FLOAT(vr0, vx1, vy1, vl);
            vr1 = VFNMSACVV_FLOAT(vr1, vx1, vy0, vl);
#endif
        }

    } else if (inc_x == 1) {

        BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);

        for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) {
            vl = VSETVL(n);

            VLSEG_FLOAT(&vx0, &vx1, x, vl);
            VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);

            vr0 = VFMACCVV_FLOAT(vr0, vx0, vy0, vl);
            vr1 = VFMACCVV_FLOAT(vr1, vx0, vy1, vl);
#if !defined(CONJ)
            vr0 = VFNMSACVV_FLOAT(vr0, vx1, vy1, vl);
            vr1 = VFMACCVV_FLOAT(vr1, vx1, vy0, vl);
#else
            vr0 = VFMACCVV_FLOAT(vr0, vx1, vy1, vl);
            vr1 = VFNMSACVV_FLOAT(vr1, vx1, vy0, vl);
#endif
        }
    } else if (inc_y == 1) {

        BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);

        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) {
            vl = VSETVL(n);

            VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
            VLSEG_FLOAT(&vy0, &vy1, y, vl);

            vr0 = VFMACCVV_FLOAT(vr0, vx0, vy0, vl);
            vr1 = VFMACCVV_FLOAT(vr1, vx0, vy1, vl);
#if !defined(CONJ)
            vr0 = VFNMSACVV_FLOAT(vr0, vx1, vy1, vl);
            vr1 = VFMACCVV_FLOAT(vr1, vx1, vy0, vl);
#else
            vr0 = VFMACCVV_FLOAT(vr0, vx1, vy1, vl);
            vr1 = VFNMSACVV_FLOAT(vr1, vx1, vy0, vl);
#endif
        }
    } else {

        BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
        BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);

        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) {
            vl = VSETVL(n);

            VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
            VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);

            vr0 = VFMACCVV_FLOAT(vr0, vx0, vy0, vl);
            vr1 = VFMACCVV_FLOAT(vr1, vx0, vy1, vl);
#if !defined(CONJ)
            vr0 = VFNMSACVV_FLOAT(vr0, vx1, vy1, vl);
            vr1 = VFMACCVV_FLOAT(vr1, vx1, vy0, vl);
#else
            vr0 = VFMACCVV_FLOAT(vr0, vx1, vy1, vl);
            vr1 = VFNMSACVV_FLOAT(vr1, vx1, vy0, vl);
#endif
        }
    }

    v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, vlmax);
    CREAL(result) = VFMVFS_FLOAT_M1(v_res);
    v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, vlmax);
    CIMAG(result) = VFMVFS_FLOAT_M1(v_res);

    return(result);
}
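For reference, the quantity the vr0/vr1 accumulators above build up is the complex dot product, unconjugated without CONJ and conjugated with CONJ. A minimal scalar sketch (hypothetical helper, not part of the patch; FLOAT/BLASLONG assumed from common.h, unit stride only):

/* Hypothetical scalar reference: without CONJ this is sum(x*y) (zdotu-style),
 * with CONJ it is sum(conjg(x)*y) (zdotc-style), matching the #if branches. */
static void zdot_ref(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *re, FLOAT *im)
{
    FLOAT r = 0.0, s = 0.0;
    for (BLASLONG k = 0; k < n; k++, x += 2, y += 2) {
#if !defined(CONJ)
        r += x[0]*y[0] - x[1]*y[1];
        s += x[0]*y[1] + x[1]*y[0];
#else
        r += x[0]*y[0] + x[1]*y[1];
        s += x[0]*y[1] - x[1]*y[0];
#endif
    }
    *re = r;
    *im = s;
}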
@@ -0,0 +1,117 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#include "common.h"

#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define FLOAT_V_T vfloat32m4_t
#define VLSEG_FLOAT vlseg2e32_v_f32m4
#define VSSEG_FLOAT vsseg2e32_v_f32m4
#define VFMVVF_FLOAT vfmv_v_f_f32m4
#define VFMULVF_FLOAT vfmul_vf_f32m4
#define VFADDVV_FLOAT vfadd_vv_f32m4
#define VFSUBVV_FLOAT vfsub_vv_f32m4
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define FLOAT_V_T vfloat64m4_t
#define VLSEG_FLOAT vlseg2e64_v_f64m4
#define VSSEG_FLOAT vsseg2e64_v_f64m4
#define VFMVVF_FLOAT vfmv_v_f_f64m4
#define VFMULVF_FLOAT vfmul_vf_f64m4
#define VFADDVV_FLOAT vfadd_vv_f64m4
#define VFSUBVV_FLOAT vfsub_vv_f64m4
#endif

int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1,
          FLOAT beta_r, FLOAT beta_i,
          FLOAT *dummy2, BLASLONG dummy3,
          FLOAT *dummy4, BLASLONG dummy5,
          FLOAT *c, BLASLONG ldc)
{
    BLASLONG chunk;
    FLOAT *c_offset;
    size_t vl;
    FLOAT_V_T vr, vi, v1, v2, v3, v4;

    ldc *= 2;
    c_offset = c;

    if (beta_r == 0.0 && beta_i == 0.0) {

        vl = VSETVL(m);
        vr = VFMVVF_FLOAT(0.0, vl);
        vi = VFMVVF_FLOAT(0.0, vl);

        for( ; n > 0; n--, c += ldc) {
            c_offset = c;

            for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl*2) {
                vl = VSETVL(chunk);

                VSSEG_FLOAT(c_offset, vr, vi, vl);
            }
        }

    } else {

        for( ; n > 0; n--, c += ldc) {
            c_offset = c;

            for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl*2) {
                vl = VSETVL(chunk);

                VLSEG_FLOAT(&vr, &vi, c_offset, vl);

                v1 = VFMULVF_FLOAT(vr, beta_r, vl);
                v2 = VFMULVF_FLOAT(vi, beta_i, vl);

                v3 = VFMULVF_FLOAT(vi, beta_r, vl);
                v4 = VFMULVF_FLOAT(vr, beta_i, vl);

                vr = VFSUBVV_FLOAT(v1, v2, vl);
                vi = VFADDVV_FLOAT(v3, v4, vl);

                VSSEG_FLOAT(c_offset, vr, vi, vl);
            }
        }

    }

    return 0;
}
@@ -0,0 +1,170 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define FLOAT_V_T vfloat32m4_t
#define VLEV_FLOAT vle32_v_f32m4
#define VLSEV_FLOAT vlse32_v_f32m4
#define VSEV_FLOAT vse32_v_f32m4
#define VSSEV_FLOAT vsse32_v_f32m4
#define VLSEG_FLOAT vlseg2e32_v_f32m4
#define VSSEG_FLOAT vsseg2e32_v_f32m4
#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
#define VSSSEG_FLOAT vssseg2e32_v_f32m4
#define VFMACCVF_FLOAT vfmacc_vf_f32m4
#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define FLOAT_V_T vfloat64m4_t
#define VLEV_FLOAT vle64_v_f64m4
#define VLSEV_FLOAT vlse64_v_f64m4
#define VSEV_FLOAT vse64_v_f64m4
#define VSSEV_FLOAT vsse64_v_f64m4
#define VLSEG_FLOAT vlseg2e64_v_f64m4
#define VSSEG_FLOAT vsseg2e64_v_f64m4
#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
#define VSSSEG_FLOAT vssseg2e64_v_f64m4
#define VFMACCVF_FLOAT vfmacc_vf_f64m4
#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
#endif

int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
    BLASLONG i;
    BLASLONG ix;
    FLOAT *a_ptr;
    FLOAT temp_r, temp_i;
    FLOAT_V_T va0, va1, vy0, vy1;

    BLASLONG stride_y = inc_y * sizeof(FLOAT) * 2;

    BLASLONG inc_x2 = inc_x * 2;
    BLASLONG lda2 = lda * 2;
    if (inc_y == 1)
    {
        for (size_t vl; m > 0; m -= vl, a += vl*2, y += vl*2) {
            vl = VSETVL(m);
            a_ptr = a;
            ix = 0;
            VLSEG_FLOAT(&vy0, &vy1, y, vl);

            for(i = 0; i < n; i++){
#if !defined(XCONJ)
                temp_r = alpha_r * x[ix] - alpha_i * x[ix+1];
                temp_i = alpha_r * x[ix+1] + alpha_i * x[ix];
#else
                temp_r = alpha_r * x[ix] + alpha_i * x[ix+1];
                temp_i = alpha_r * x[ix+1] - alpha_i * x[ix];
#endif

                VLSEG_FLOAT(&va0, &va1, a_ptr, vl);
#if !defined(CONJ)
#if !defined(XCONJ)
                vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl);
                vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, vl);
                vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, vl);
                vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, vl);
#else
                vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl);
                vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, vl);
                vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, vl);
                vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, vl);
#endif
#else
#if !defined(XCONJ)
                vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl);
                vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, vl);
                vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, vl);
                vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, vl);
#else
                vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl);
                vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, vl);
                vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, vl);
                vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, vl);
#endif
#endif
                a_ptr += lda2;
                ix += inc_x2;
            }
            VSSEG_FLOAT(y, vy0, vy1, vl);
        }

    }
    else
    {
        for (size_t vl; m > 0; m -= vl, a += vl*2, y += vl*inc_y*2) {
            vl = VSETVL(m);
            a_ptr = a;
            ix = 0;
            VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);

            for(i = 0; i < n; i++){
#if !defined(XCONJ)
                temp_r = alpha_r * x[ix] - alpha_i * x[ix+1];
                temp_i = alpha_r * x[ix+1] + alpha_i * x[ix];
#else
                temp_r = alpha_r * x[ix] + alpha_i * x[ix+1];
                temp_i = alpha_r * x[ix+1] - alpha_i * x[ix];
#endif

                VLSEG_FLOAT(&va0, &va1, a_ptr, vl);
#if !defined(CONJ)
#if !defined(XCONJ)
                vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl);
                vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, vl);
                vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, vl);
                vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, vl);
#else
                vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl);
                vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, vl);
                vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, vl);
                vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, vl);
#endif
#else
#if !defined(XCONJ)
                vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl);
                vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, vl);
                vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, vl);
                vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, vl);
#else
                vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl);
                vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, vl);
                vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, vl);
                vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, vl);
#endif
#endif
                a_ptr += lda2;
                ix += inc_x2;
            }
            VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl);
        }
    }
    return(0);
}
@@ -0,0 +1,172 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define VSETVL_MAX_M1 vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m4_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLSEG_FLOAT vlseg2e32_v_f32m4
#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
#define VFMACCVV_FLOAT vfmacc_vv_f32m4
#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4
#define VFMVVF_FLOAT vfmv_v_f_f32m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFMULVV_FLOAT vfmul_vv_f32m4
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define VSETVL_MAX_M1 vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m4_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLSEG_FLOAT vlseg2e64_v_f64m4
#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
#define VFMACCVV_FLOAT vfmacc_vv_f64m4
#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4
#define VFMVVF_FLOAT vfmv_v_f_f64m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFMULVV_FLOAT vfmul_vv_f64m4
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
#endif

int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
    BLASLONG i = 0, j = 0;
    BLASLONG ix = 0, iy = 0;
    FLOAT *a_ptr = a;
    FLOAT temp_r, temp_i;

    FLOAT_V_T va0, va1, vx0, vx1, vr, vi;
    FLOAT_V_T_M1 v_res, v_z0;

    BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
    //BLASLONG stride_a = sizeof(FLOAT) * 2;
    BLASLONG inc_y2 = inc_y * 2;
    BLASLONG lda2 = lda * 2;

    size_t vlmax = VSETVL_MAX_M1;
    v_res = VFMVVF_FLOAT_M1(0, vlmax);
    v_z0 = VFMVVF_FLOAT_M1(0, vlmax);
    vlmax = VSETVL(m);

    if (inc_x == 1)
    {
        for(i = 0; i < n; i++) {
            j = 0;
            ix = 0;
            vr = VFMVVF_FLOAT(0, vlmax);
            vi = VFMVVF_FLOAT(0, vlmax);
            for(size_t vl, k = m; k > 0; k -= vl) {
                vl = VSETVL(k);

                VLSEG_FLOAT(&va0, &va1, &a_ptr[j], vl);
                VLSEG_FLOAT(&vx0, &vx1, &x[ix], vl);

#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
                vr = VFMACCVV_FLOAT(vr, va0, vx0, vl);
                vr = VFNMSACVV_FLOAT(vr, va1, vx1, vl);
                vi = VFMACCVV_FLOAT(vi, va0, vx1, vl);
                vi = VFMACCVV_FLOAT(vi, va1, vx0, vl);
#else
                vr = VFMACCVV_FLOAT(vr, va0, vx0, vl);
                vr = VFMACCVV_FLOAT(vr, va1, vx1, vl);
                vi = VFMACCVV_FLOAT(vi, va0, vx1, vl);
                vi = VFNMSACVV_FLOAT(vi, va1, vx0, vl);
#endif
                j += vl * 2;
                ix += vl * inc_x * 2;
            }

            v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax);
            temp_r = VFMVFS_FLOAT_M1(v_res);
            v_res = VFREDSUM_FLOAT(v_res, vi, v_z0, vlmax);
            temp_i = VFMVFS_FLOAT_M1(v_res);

#if !defined(XCONJ)
            y[iy] += alpha_r * temp_r - alpha_i * temp_i;
            y[iy+1] += alpha_r * temp_i + alpha_i * temp_r;
#else
            y[iy] += alpha_r * temp_r + alpha_i * temp_i;
            y[iy+1] -= alpha_r * temp_i - alpha_i * temp_r;
#endif
            iy += inc_y2;
            a_ptr += lda2;
        }
    }
    else
    {
        for(i = 0; i < n; i++) {
            j = 0;
            ix = 0;
            vr = VFMVVF_FLOAT(0, vlmax);
            vi = VFMVVF_FLOAT(0, vlmax);
            for(size_t vl, k = m; k > 0; k -= vl) {
                vl = VSETVL(k);

                VLSEG_FLOAT(&va0, &va1, &a_ptr[j], vl);
                VLSSEG_FLOAT(&vx0, &vx1, &x[ix], stride_x, vl);

#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
                vr = VFMACCVV_FLOAT(vr, va0, vx0, vl);
                vr = VFNMSACVV_FLOAT(vr, va1, vx1, vl);
                vi = VFMACCVV_FLOAT(vi, va0, vx1, vl);
                vi = VFMACCVV_FLOAT(vi, va1, vx0, vl);
#else
                vr = VFMACCVV_FLOAT(vr, va0, vx0, vl);
                vr = VFMACCVV_FLOAT(vr, va1, vx1, vl);
                vi = VFMACCVV_FLOAT(vi, va0, vx1, vl);
                vi = VFNMSACVV_FLOAT(vi, va1, vx0, vl);
#endif
                j += vl * 2;
                ix += vl * inc_x * 2;
            }

            v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax);
            temp_r = VFMVFS_FLOAT_M1(v_res);
            v_res = VFREDSUM_FLOAT(v_res, vi, v_z0, vlmax);
            temp_i = VFMVFS_FLOAT_M1(v_res);

#if !defined(XCONJ)
            y[iy] += alpha_r * temp_r - alpha_i * temp_i;
            y[iy+1] += alpha_r * temp_i + alpha_i * temp_r;
#else
            y[iy] += alpha_r * temp_r + alpha_i * temp_i;
            y[iy+1] -= alpha_r * temp_i - alpha_i * temp_r;
#endif
            iy += inc_y2;
            a_ptr += lda2;
        }

    }

    return(0);
}
@@ -0,0 +1,122 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define VSETVL_MAX vsetvlmax_e32m4()
#define VSETVL_MAX_M1 vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m4_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLSEG_FLOAT vlseg2e32_v_f32m4
#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
#define VFMACCVV_FLOAT vfmacc_vv_f32m4
#define VFMVVF_FLOAT vfmv_v_f_f32m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
#define VFABSV_FLOAT vfabs_v_f32m4
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define VSETVL_MAX vsetvlmax_e64m4()
#define VSETVL_MAX_M1 vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m4_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLSEG_FLOAT vlseg2e64_v_f64m4
#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
#define VFMACCVV_FLOAT vfmacc_vv_f64m4
#define VFMVVF_FLOAT vfmv_v_f_f64m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
#define VFABSV_FLOAT vfabs_v_f64m4
#endif

// TODO: Should single precision use the widening MAC, or perhaps all should be double?

FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{

    if ( n <= 0 ) return(0.0);

    FLOAT_V_T vr, v0, v1;
    FLOAT_V_T_M1 v_max, v_res;
    FLOAT scale = 0.0, ssq = 0.0;

    size_t vlmax = VSETVL_MAX;
    v_res = VFMVVF_FLOAT_M1(0, vlmax);
    v_max = VFMVVF_FLOAT_M1(0, vlmax);

    vr = VFMVVF_FLOAT(0, vlmax);

    if (inc_x == 1) {

        for (size_t vl; n > 0; n -= vl, x += vl*2) {
            vl = VSETVL(n);

            VLSEG_FLOAT(&v0, &v1, x, vl);
            v0 = VFABSV_FLOAT(v0, vl);
            v1 = VFABSV_FLOAT(v1, vl);

            v_max = VFREDMAXVS_FLOAT(v_max, v0, v_max, vl);
            vr = VFMACCVV_FLOAT(vr, v0, v0, vl);

            v_max = VFREDMAXVS_FLOAT(v_max, v1, v_max, vl);
            vr = VFMACCVV_FLOAT(vr, v1, v1, vl);
        }

    } else {

        BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;

        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
            vl = VSETVL(n);

            VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl);
            v0 = VFABSV_FLOAT(v0, vl);
            v1 = VFABSV_FLOAT(v1, vl);

            v_max = VFREDMAXVS_FLOAT(v_max, v0, v_max, vl);
            vr = VFMACCVV_FLOAT(vr, v0, v0, vl);

            v_max = VFREDMAXVS_FLOAT(v_max, v1, v_max, vl);
            vr = VFMACCVV_FLOAT(vr, v1, v1, vl);
        }

    }

    v_res = VFREDSUM_FLOAT(v_res, vr, v_res, vlmax);

    ssq = VFMVFS_FLOAT_M1(v_res);
    scale = VFMVFS_FLOAT_M1(v_max);
    ssq = ssq / (scale*scale);

    return(scale * sqrt(ssq));
}
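For reference, the nrm2 kernel above tracks scale = max component magnitude and an unscaled sum of squares, then returns scale * sqrt(ssq / scale^2). A minimal scalar sketch of the same computation (hypothetical helper, not part of the patch; fabs/sqrt assumed from math.h via common.h, and a zero-vector guard is added here that the vector kernel itself does not perform):

/* Hypothetical scalar reference for the reduction above. */
static FLOAT znrm2_ref(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    FLOAT scale = 0.0, ssq = 0.0;
    for (BLASLONG i = 0; i < n; i++, x += inc_x * 2) {
        FLOAT re = fabs(x[0]), im = fabs(x[1]);
        if (re > scale) scale = re;
        if (im > scale) scale = im;
        ssq += re * re + im * im;   /* unscaled, as in the kernel above */
    }
    if (scale == 0.0) return 0.0;   /* guard: the kernel divides by scale^2 unconditionally */
    return scale * sqrt(ssq / (scale * scale));
}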
@@ -0,0 +1,181 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define FLOAT_V_T vfloat32m4_t
#define VLEV_FLOAT vle32_v_f32m4
#define VLSEV_FLOAT vlse32_v_f32m4
#define VSEV_FLOAT vse32_v_f32m4
#define VSSEV_FLOAT vsse32_v_f32m4
#define VLSEG_FLOAT vlseg2e32_v_f32m4
#define VSSEG_FLOAT vsseg2e32_v_f32m4
#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
#define VSSSEG_FLOAT vssseg2e32_v_f32m4
#define VFMACCVF_FLOAT vfmacc_vf_f32m4
#define VFMULVF_FLOAT vfmul_vf_f32m4
#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define FLOAT_V_T vfloat64m4_t
#define VLEV_FLOAT vle64_v_f64m4
#define VLSEV_FLOAT vlse64_v_f64m4
#define VSEV_FLOAT vse64_v_f64m4
#define VSSEV_FLOAT vsse64_v_f64m4
#define VLSEG_FLOAT vlseg2e64_v_f64m4
#define VSSEG_FLOAT vsseg2e64_v_f64m4
#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
#define VSSSEG_FLOAT vssseg2e64_v_f64m4
#define VFMACCVF_FLOAT vfmacc_vf_f64m4
#define VFMULVF_FLOAT vfmul_vf_f64m4
#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
#endif

int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{

    if (n <= 0) return(0);

    FLOAT_V_T vt0, vt1, vx0, vx1, vy0, vy1;

    if (inc_x == 0 && inc_y == 0) {
        BLASLONG i=0;
        BLASLONG ix=0,iy=0;
        FLOAT temp[2];
        BLASLONG inc_x2;
        BLASLONG inc_y2;

        inc_x2 = 2 * inc_x ;
        inc_y2 = 2 * inc_y ;

        while(i < n)
        {
            temp[0] = c*x[ix] + s*y[iy] ;
            temp[1] = c*x[ix+1] + s*y[iy+1] ;
            y[iy] = c*y[iy] - s*x[ix] ;
            y[iy+1] = c*y[iy+1] - s*x[ix+1] ;
            x[ix] = temp[0] ;
            x[ix+1] = temp[1] ;

            ix += inc_x2 ;
            iy += inc_y2 ;
            i++ ;
        }
    }
    else if(inc_x == 1 && inc_y == 1) {

        for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) {
            vl = VSETVL(n);

            VLSEG_FLOAT(&vx0, &vx1, x, vl);
            VLSEG_FLOAT(&vy0, &vy1, y, vl);

            vt0 = VFMULVF_FLOAT(vx0, c, vl);
            vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl);
            vt1 = VFMULVF_FLOAT(vx1, c, vl);
            vt1 = VFMACCVF_FLOAT(vt1, s, vy1, vl);
            vy0 = VFMULVF_FLOAT(vy0, c, vl);
            vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, vl);
            vy1 = VFMULVF_FLOAT(vy1, c, vl);
            vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl);

            VSSEG_FLOAT(x, vt0, vt1, vl);
            VSSEG_FLOAT(y, vy0, vy1, vl);
        }

    } else if (inc_x == 1) {
        BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);

        for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) {
            vl = VSETVL(n);

            VLSEG_FLOAT(&vx0, &vx1, x, vl);
            VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);

            vt0 = VFMULVF_FLOAT(vx0, c, vl);
            vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl);
            vt1 = VFMULVF_FLOAT(vx1, c, vl);
            vt1 = VFMACCVF_FLOAT(vt1, s, vy1, vl);
            vy0 = VFMULVF_FLOAT(vy0, c, vl);
            vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, vl);
            vy1 = VFMULVF_FLOAT(vy1, c, vl);
            vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl);

            VSSEG_FLOAT(x, vt0, vt1, vl);
            VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl);
        }

    } else if (inc_y == 1) {
        BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);

        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) {
            vl = VSETVL(n);

            VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
            VLSEG_FLOAT(&vy0, &vy1, y, vl);

            vt0 = VFMULVF_FLOAT(vx0, c, vl);
            vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl);
            vt1 = VFMULVF_FLOAT(vx1, c, vl);
            vt1 = VFMACCVF_FLOAT(vt1, s, vy1, vl);
            vy0 = VFMULVF_FLOAT(vy0, c, vl);
            vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, vl);
            vy1 = VFMULVF_FLOAT(vy1, c, vl);
            vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl);

            VSSSEG_FLOAT(x, stride_x, vt0, vt1, vl);
            VSSEG_FLOAT(y, vy0, vy1, vl);
        }

    } else {
        BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
        BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);

        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) {
            vl = VSETVL(n);

            VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
            VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);

            vt0 = VFMULVF_FLOAT(vx0, c, vl);
            vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl);
            vt1 = VFMULVF_FLOAT(vx1, c, vl);
            vt1 = VFMACCVF_FLOAT(vt1, s, vy1, vl);
            vy0 = VFMULVF_FLOAT(vy0, c, vl);
            vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, vl);
            vy1 = VFMULVF_FLOAT(vy1, c, vl);
            vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl);

            VSSSEG_FLOAT(x, stride_x, vt0, vt1, vl);
            VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl);
        }
    }

    return 0;
}
@@ -0,0 +1,148 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

#if !defined(DOUBLE)
#define VSETVL(n) vsetvl_e32m4(n)
#define VSETVL_MAX vsetvlmax_e32m4()
#define FLOAT_V_T vfloat32m4_t
#define VLSEG_FLOAT vlseg2e32_v_f32m4
#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
#define VSSEG_FLOAT vsseg2e32_v_f32m4
#define VSSSEG_FLOAT vssseg2e32_v_f32m4
#define VFMACCVF_FLOAT vfmacc_vf_f32m4
#define VFMULVF_FLOAT vfmul_vf_f32m4
#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
#define VFMVVF_FLOAT vfmv_v_f_f32m4
#else
#define VSETVL(n) vsetvl_e64m4(n)
#define VSETVL_MAX vsetvlmax_e64m4()
#define FLOAT_V_T vfloat64m4_t
#define VLSEG_FLOAT vlseg2e64_v_f64m4
#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
#define VSSEG_FLOAT vsseg2e64_v_f64m4
#define VSSSEG_FLOAT vssseg2e64_v_f64m4
#define VFMACCVF_FLOAT vfmacc_vf_f64m4
#define VFMULVF_FLOAT vfmul_vf_f64m4
#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
#define VFMVVF_FLOAT vfmv_v_f_f64m4
#endif

int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{

    if((n <= 0) || (inc_x <= 0)) return(0);

    FLOAT_V_T vt, vr, vi;
    BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
    size_t vlmax = VSETVL_MAX;

    if(da_r == 0.0 && da_i == 0.0) {

        vr = VFMVVF_FLOAT(0.0, vlmax);
        vi = VFMVVF_FLOAT(0.0, vlmax);

        if(inc_x == 1) {

            for (size_t vl; n > 0; n -= vl, x += vl*2) {
                vl = VSETVL(n);

                VSSEG_FLOAT(x, vr, vi, vl);
            }

        } else {

            for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
                vl = VSETVL(n);

                VSSSEG_FLOAT(x, stride_x, vr, vi, vl);
            }
        }

    } else if(da_r == 0.0) {

        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
            vl = VSETVL(n);

            VLSSEG_FLOAT(&vr, &vi, x, stride_x, vl);

            vt = VFMULVF_FLOAT(vi, -da_i, vl);
            vi = VFMULVF_FLOAT(vr, da_i, vl);

            VSSSEG_FLOAT(x, stride_x, vt, vi, vl);
        }

    } else if(da_i == 0.0) {

        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
            vl = VSETVL(n);

            VLSSEG_FLOAT(&vr, &vi, x, stride_x, vl);

            vr = VFMULVF_FLOAT(vr, da_r, vl);
            vi = VFMULVF_FLOAT(vi, da_r, vl);

            VSSSEG_FLOAT(x, stride_x, vr, vi, vl);
        }

    } else {

        if(inc_x == 1) {

            for (size_t vl; n > 0; n -= vl, x += vl*2) {
                vl = VSETVL(n);

                VLSEG_FLOAT(&vr, &vi, x, vl);

                vt = VFMULVF_FLOAT(vr, da_r, vl);
                vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl);
                vi = VFMULVF_FLOAT(vi, da_r, vl);
                vi = VFMACCVF_FLOAT(vi, da_i, vr, vl);

                VSSEG_FLOAT(x, vt, vi, vl);
            }

        } else {

            for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
                vl = VSETVL(n);

                VLSSEG_FLOAT(&vr, &vi, x, stride_x, vl);

                vt = VFMULVF_FLOAT(vr, da_r, vl);
                vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl);
                vi = VFMULVF_FLOAT(vi, da_r, vl);
                vi = VFMACCVF_FLOAT(vi, da_i, vr, vl);

                VSSSEG_FLOAT(x, stride_x, vt, vi, vl);
            }
        }
    }

    return(0);
}
@ -0,0 +1,97 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m4(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m4()
|
||||
#define FLOAT_V_T vfloat32m4_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VLSEG_FLOAT vlseg2e32_v_f32m4
|
||||
#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
|
||||
#define VFREDSUMVS_FLOAT vfredusum_vs_f32m4_f32m1
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f32m4
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
|
||||
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
|
||||
#define VFADDVV_FLOAT vfadd_vv_f32m4
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m4(n)
|
||||
#define VSETVL_MAX vsetvlmax_e64m4()
|
||||
#define FLOAT_V_T vfloat64m4_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VLSEG_FLOAT vlseg2e64_v_f64m4
|
||||
#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
|
||||
#define VFREDSUMVS_FLOAT vfredusum_vs_f64m4_f64m1
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f64m4
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
|
||||
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
|
||||
#define VFADDVV_FLOAT vfadd_vv_f64m4
|
||||
#endif
|
||||
|
||||
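// Complex sum kernel: accumulates the real and imaginary parts of every
// element and reduces the accumulator to a single scalar at the end.
// Unlike asum, no absolute values are taken (the plain "sum" reduction).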
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
FLOAT sumf = 0.0;
|
||||
if (n <= 0 || inc_x <= 0) return(sumf);
|
||||
|
||||
FLOAT_V_T v0, v1;
|
||||
size_t vlmax = VSETVL_MAX;
|
||||
FLOAT_V_T v_sum = VFMVVF_FLOAT(0, vlmax);
|
||||
|
||||
if(inc_x == 1) {
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl*2) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
VLSEG_FLOAT(&v0, &v1, x, vl);
|
||||
|
||||
v_sum = VFADDVV_FLOAT(v_sum, v0, vl);
|
||||
v_sum = VFADDVV_FLOAT(v_sum, v1, vl);
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl);
|
||||
|
||||
v_sum = VFADDVV_FLOAT(v_sum, v0, vl);
|
||||
v_sum = VFADDVV_FLOAT(v_sum, v1, vl);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
FLOAT_V_T_M1 v_z0 = VFMVVF_FLOAT_M1(0, vlmax);
|
||||
FLOAT_V_T_M1 v_res = VFMVVF_FLOAT_M1(0, vlmax);
|
||||
v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, vlmax);
|
||||
sumf += VFMVFS_FLOAT_M1(v_res);
|
||||
|
||||
return(sumf);
|
||||
}
|
|
@@ -0,0 +1,156 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m4(n)
|
||||
#define FLOAT_V_T vfloat32m4_t
|
||||
#define VLSEG_FLOAT vlseg2e32_v_f32m4
|
||||
#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
|
||||
#define VSSEG_FLOAT vsseg2e32_v_f32m4
|
||||
#define VSSSEG_FLOAT vssseg2e32_v_f32m4
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m4(n)
|
||||
#define FLOAT_V_T vfloat64m4_t
|
||||
#define VLSEG_FLOAT vlseg2e64_v_f64m4
|
||||
#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
|
||||
#define VSSEG_FLOAT vsseg2e64_v_f64m4
|
||||
#define VSSSEG_FLOAT vssseg2e64_v_f64m4
|
||||
#endif
|
||||
|
||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
|
||||
{
|
||||
|
||||
if (n <= 0) return(0);
|
||||
|
||||
FLOAT_V_T vx0, vx1, vy0, vy1;
|
||||
|
||||
if (inc_x == 0 && inc_y == 0) {
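// With both increments zero, x and y each denote a single complex element;
// swapping the same pair n times is a net swap only when n is odd and a
// no-op when n is even.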
|
||||
if (n & 1) {
|
||||
FLOAT temp[2];
|
||||
temp[0] = x[0];
|
||||
temp[1] = x[1];
|
||||
x[0] = y[0];
|
||||
x[1] = y[1];
|
||||
y[0] = temp[0];
|
||||
y[1] = temp[1];
|
||||
}
|
||||
else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
else if(inc_x == 0) {
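// x has increment zero: the reference loop "temp = x; x = y[i]; y[i] = temp"
// leaves x equal to the last element of y, y[0] equal to the old x, and every
// other y[i] equal to y[i-1]. Reproduce that by shifting y one element toward
// the end (walking backwards with a negative stride) and then patching the
// endpoints. The inc_y == 0 branch below mirrors this with x and y exchanged.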
|
||||
FLOAT temp[2];
|
||||
temp[0] = x[0];
|
||||
temp[1] = x[1];
|
||||
x[0] = y[(n - 1) * inc_y * 2];
|
||||
x[1] = y[(n - 1) * inc_y * 2 + 1];
|
||||
FLOAT* ptr = y + (n - 1) * inc_y * 2; // start from the last one
|
||||
BLASLONG stride_y = (0 - inc_y) * sizeof(FLOAT) * 2; // reverse
|
||||
BLASLONG m = n - 1;
|
||||
for (size_t vl; m > 0; m -= vl, ptr -= vl*inc_y * 2) {
|
||||
vl = VSETVL(m);
|
||||
VLSSEG_FLOAT(&vy0, &vy1, ptr - 2, stride_y, vl);
|
||||
VSSSEG_FLOAT(ptr, stride_y, vy0, vy1, vl);
|
||||
}
|
||||
y[0] = temp[0];
|
||||
y[1] = temp[1];
|
||||
}
|
||||
else if(inc_y == 0) {
|
||||
FLOAT temp[2];
|
||||
temp[0] = y[0];
|
||||
temp[1] = y[1];
|
||||
y[0] = x[(n - 1) * inc_x * 2];
|
||||
y[1] = x[(n - 1) * inc_x * 2 + 1];
|
||||
FLOAT* ptr = x + (n - 1) * inc_x * 2; // start from the last one
|
||||
BLASLONG stride_x = (0 - inc_x) * sizeof(FLOAT) * 2; // reverse
|
||||
BLASLONG m = n - 1;
|
||||
for (size_t vl; m > 0; m -= vl, ptr -= vl*inc_x * 2) {
|
||||
vl = VSETVL(m);
|
||||
VLSSEG_FLOAT(&vx0, &vx1, ptr - 2, stride_x, vl);
|
||||
VSSSEG_FLOAT(ptr, stride_x, vx0, vx1, vl);
|
||||
}
|
||||
x[0] = temp[0];
|
||||
x[1] = temp[1];
|
||||
}
|
||||
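// Remaining cases: plain segment loads/stores, using the unit-stride form for
// whichever of x and y has increment 1 and the strided form otherwise, then
// storing each vector back over the other.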
else if(inc_x == 1 && inc_y == 1) {
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
VLSEG_FLOAT(&vx0, &vx1, x, vl);
|
||||
VLSEG_FLOAT(&vy0, &vy1, y, vl);
|
||||
|
||||
VSSEG_FLOAT(y, vx0, vx1, vl);
|
||||
VSSEG_FLOAT(x, vy0, vy1, vl);
|
||||
}
|
||||
|
||||
} else if (inc_x == 1){
|
||||
BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
VLSEG_FLOAT(&vx0, &vx1, x, vl);
|
||||
VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);
|
||||
|
||||
VSSSEG_FLOAT(y, stride_y, vx0, vx1, vl);
|
||||
VSSEG_FLOAT(x, vy0, vy1, vl);
|
||||
}
|
||||
|
||||
} else if (inc_y == 1){
|
||||
BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
|
||||
VLSEG_FLOAT(&vy0, &vy1, y, vl);
|
||||
|
||||
VSSEG_FLOAT(y, vx0, vx1, vl);
|
||||
VSSSEG_FLOAT(x, stride_x, vy0, vy1, vl);
|
||||
}
|
||||
|
||||
} else {
|
||||
BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
|
||||
BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) {
|
||||
vl = VSETVL(n);
|
||||
|
||||
VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
|
||||
VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);
|
||||
|
||||
VSSSEG_FLOAT(y, stride_y, vx0, vx1, vl);
|
||||
VSSSEG_FLOAT(x, stride_x, vy0, vy1, vl);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return(0);
|
||||
}
|
|
@@ -0,0 +1,596 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m2(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m2()
|
||||
#define VSETVL_MAX_M1 vsetvlmax_e32m1()
|
||||
#define FLOAT_V_T vfloat32m2_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VLEV_FLOAT vle32_v_f32m2
|
||||
#define VLSEG4_FLOAT vlseg4e32_v_f32m2
|
||||
#define VLSEG2_FLOAT vlseg2e32_v_f32m2
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f32m2
|
||||
#define VFMACCVF_FLOAT vfmacc_vf_f32m2
|
||||
#define VFMACCVV_FLOAT vfmacc_vv_f32m2
|
||||
#define VFNMSACVV_FLOAT vfnmsac_vv_f32m2
|
||||
#define VFREDSUMVS_FLOAT vfredusum_vs_f32m2_f32m1
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
|
||||
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m2(n)
|
||||
#define VSETVL_MAX vsetvlmax_e64m2()
|
||||
#define VSETVL_MAX_M1 vsetvlmax_e64m1()
|
||||
#define FLOAT_V_T vfloat64m2_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VLEV_FLOAT vle64_v_f64m2
|
||||
#define VLSEG4_FLOAT vlseg4e64_v_f64m2
|
||||
#define VLSEG2_FLOAT vlseg2e64_v_f64m2
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f64m2
|
||||
#define VFMACCVF_FLOAT vfmacc_vf_f64m2
|
||||
#define VFMACCVV_FLOAT vfmacc_vv_f64m2
|
||||
#define VFNMSACVV_FLOAT vfnmsac_vv_f64m2
|
||||
#define VFREDSUMVS_FLOAT vfredusum_vs_f64m2_f64m1
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
|
||||
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
|
||||
#endif
|
||||
|
||||
// Optimizes the implementation in ../generic/ztrmmkernel_2x2.c
|
||||
|
||||
|
||||
/********************************
|
||||
ADD1 a*c
|
||||
ADD2 b*c
|
||||
ADD3 a*d
|
||||
ADD4 b*d
|
||||
*********************************/
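/* Writing an element of A as a + b*i and an element of B as c + d*i
   (va0/va1 and vb0/vb1 after the segment loads), their product is
       (a + b*i)*(c + d*i) = (a*c - b*d) + (b*c + a*d)*i,
   i.e. the four partial products listed above combined with the right signs.
   The conjugation variants below only change those signs:
   NN/NT/TN/TT -> A*B,        NR/NC/TR/TC -> A*conj(B),
   RN/RT/CN/CT -> conj(A)*B,  RR/RC/CR/CC -> conj(A)*conj(B),
   which is why each #if block repeats the same FMACC/FNMSAC pattern with the
   signs of the b*d term and the imaginary-part terms flipped. */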
|
||||
int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb,
|
||||
FLOAT* C,BLASLONG ldc, BLASLONG offset)
|
||||
{
|
||||
BLASLONG i,j,k;
|
||||
FLOAT *C0,*C1,*ptrba,*ptrbb;
|
||||
FLOAT res0,res1;
|
||||
BLASLONG off, temp;
|
||||
|
||||
FLOAT_V_T va0, va1, va2, va3, vb0, vb1, vb2, vb3;
|
||||
FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7;
|
||||
FLOAT_V_T_M1 v_m1_res0, v_m1_res1;
|
||||
FLOAT_V_T_M1 v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1);
|
||||
|
||||
size_t vl;
|
||||
size_t vlmax = VSETVL_MAX;
|
||||
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
off = -offset;
|
||||
#else
|
||||
off = 0;
|
||||
#endif
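// Standard TRMM bookkeeping, as in the generic kernels: "off" tracks where the
// current block sits relative to the diagonal of the triangular operand, and
// "temp" below is the number of k iterations that actually carry data for the
// block; the rest of the panel is skipped.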
|
||||
|
||||
for (j = bn/2; j > 0; j--)
|
||||
{
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
off = offset;
|
||||
#endif
|
||||
C0 = C;
|
||||
C1 = C0+2*ldc;
|
||||
ptrba = ba;
|
||||
|
||||
for (i = bm/2; i > 0; i--)
|
||||
{
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
ptrbb = bb;
|
||||
#else
|
||||
ptrba += off*2*2;
|
||||
ptrbb = bb+off*2*2;
|
||||
#endif
|
||||
|
||||
vres0 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres1 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres2 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres3 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres4 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres5 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres6 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres7 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
||||
temp = bk - off;
|
||||
#elif defined(LEFT)
|
||||
temp = off + 2;
|
||||
#else
|
||||
temp = off + 2;
|
||||
#endif
|
||||
|
||||
for (k = temp; k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl);
|
||||
VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl);
|
||||
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
|
||||
vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
|
||||
vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl);
|
||||
vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
|
||||
|
||||
vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl);
|
||||
vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl);
|
||||
vres2 = VFNMSACVV_FLOAT(vres2, va3, vb1, vl);
|
||||
vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl);
|
||||
|
||||
vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl);
|
||||
vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl);
|
||||
vres4 = VFNMSACVV_FLOAT(vres4, va1, vb3, vl);
|
||||
vres5 = VFMACCVV_FLOAT(vres5, va0, vb3, vl);
|
||||
|
||||
vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl);
|
||||
vres7 = VFMACCVV_FLOAT(vres7, va3, vb2, vl);
|
||||
vres6 = VFNMSACVV_FLOAT(vres6, va3, vb3, vl);
|
||||
vres7 = VFMACCVV_FLOAT(vres7, va2, vb3, vl);
|
||||
#endif
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
||||
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
|
||||
vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
|
||||
vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl);
|
||||
vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl);
|
||||
|
||||
vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl);
|
||||
vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl);
|
||||
vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl);
|
||||
vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl);
|
||||
|
||||
vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl);
|
||||
vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl);
|
||||
vres4 = VFMACCVV_FLOAT(vres4, va1, vb3, vl);
|
||||
vres5 = VFNMSACVV_FLOAT(vres5, va0, vb3, vl);
|
||||
|
||||
vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl);
|
||||
vres7 = VFMACCVV_FLOAT(vres7, va3, vb2, vl);
|
||||
vres6 = VFMACCVV_FLOAT(vres6, va3, vb3, vl);
|
||||
vres7 = VFNMSACVV_FLOAT(vres7, va2, vb3, vl);
|
||||
#endif
|
||||
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
||||
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
|
||||
vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl);
|
||||
vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl);
|
||||
vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
|
||||
|
||||
vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl);
|
||||
vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl);
|
||||
vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl);
|
||||
vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl);
|
||||
|
||||
vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl);
|
||||
vres5 = VFNMSACVV_FLOAT(vres5, va1, vb2, vl);
|
||||
vres4 = VFMACCVV_FLOAT(vres4, va1, vb3, vl);
|
||||
vres5 = VFMACCVV_FLOAT(vres5, va0, vb3, vl);
|
||||
|
||||
vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl);
|
||||
vres7 = VFNMSACVV_FLOAT(vres7, va3, vb2, vl);
|
||||
vres6 = VFMACCVV_FLOAT(vres6, va3, vb3, vl);
|
||||
vres7 = VFMACCVV_FLOAT(vres7, va2, vb3, vl);
|
||||
#endif
|
||||
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
|
||||
vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl);
|
||||
vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl);
|
||||
vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl);
|
||||
|
||||
vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl);
|
||||
vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl);
|
||||
vres2 = VFNMSACVV_FLOAT(vres2, va3, vb1, vl);
|
||||
vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl);
|
||||
|
||||
vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl);
|
||||
vres5 = VFNMSACVV_FLOAT(vres5, va1, vb2, vl);
|
||||
vres4 = VFNMSACVV_FLOAT(vres4, va1, vb3, vl);
|
||||
vres5 = VFNMSACVV_FLOAT(vres5, va0, vb3, vl);
|
||||
|
||||
vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl);
|
||||
vres7 = VFNMSACVV_FLOAT(vres7, va3, vb2, vl);
|
||||
vres6 = VFNMSACVV_FLOAT(vres6, va3, vb3, vl);
|
||||
vres7 = VFNMSACVV_FLOAT(vres7, va2, vb3, vl);
|
||||
|
||||
#endif
|
||||
ptrba += vl * 4;
|
||||
ptrbb += vl * 4;
|
||||
}
|
||||
|
||||
v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax);
|
||||
v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax);
|
||||
res0 = VFMVFS_FLOAT_M1(v_m1_res0);
|
||||
res1 = VFMVFS_FLOAT_M1(v_m1_res1);
|
||||
C0[0] = res0 * alphar - res1 * alphai;
|
||||
C0[1] = res1 * alphar + res0 * alphai;
|
||||
|
||||
v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres2, v_z0, vlmax);
|
||||
v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres3, v_z0, vlmax);
|
||||
res0 = VFMVFS_FLOAT_M1(v_m1_res0);
|
||||
res1 = VFMVFS_FLOAT_M1(v_m1_res1);
|
||||
C0[2] = res0 * alphar - res1 * alphai;
|
||||
C0[3] = res1 * alphar + res0 * alphai;
|
||||
|
||||
v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres4, v_z0, vlmax);
|
||||
v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres5, v_z0, vlmax);
|
||||
res0 = VFMVFS_FLOAT_M1(v_m1_res0);
|
||||
res1 = VFMVFS_FLOAT_M1(v_m1_res1);
|
||||
C1[0] = res0 * alphar - res1 * alphai;
|
||||
C1[1] = res1 * alphar + res0 * alphai;
|
||||
|
||||
v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres6, v_z0, vlmax);
|
||||
v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres7, v_z0, vlmax);
|
||||
res0 = VFMVFS_FLOAT_M1(v_m1_res0);
|
||||
res1 = VFMVFS_FLOAT_M1(v_m1_res1);
|
||||
C1[2] = res0 * alphar - res1 * alphai;
|
||||
C1[3] = res1 * alphar + res0 * alphai;
|
||||
#if ( defined(LEFT) && defined(TRANSA)) || \
|
||||
(!defined(LEFT) && !defined(TRANSA))
|
||||
temp = bk - off;
|
||||
#ifdef LEFT
|
||||
temp -= 2;
|
||||
#else
|
||||
temp -= 2;
|
||||
#endif
|
||||
|
||||
ptrba += temp*2*2;
|
||||
ptrbb += temp*2*2;
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef LEFT
|
||||
off += 2;
|
||||
#endif
|
||||
|
||||
C0 = C0+4;
|
||||
C1 = C1+4;
|
||||
}
|
||||
|
||||
if (bm & 1)
|
||||
{
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
ptrbb = bb;
|
||||
#else
|
||||
ptrba += off*2;
|
||||
ptrbb = bb + off*2*2;
|
||||
#endif
|
||||
vres0 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres1 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres2 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres3 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
||||
temp = bk - off;
|
||||
#elif defined(LEFT)
|
||||
temp = off+1;
|
||||
#else
|
||||
temp = off+2;
|
||||
#endif
|
||||
for (k = temp; k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
|
||||
VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl);
|
||||
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
|
||||
vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
|
||||
vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl);
|
||||
vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
|
||||
|
||||
vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl);
|
||||
vres3 = VFMACCVV_FLOAT(vres3, va1, vb2, vl);
|
||||
vres2 = VFNMSACVV_FLOAT(vres2, va1, vb3, vl);
|
||||
vres3 = VFMACCVV_FLOAT(vres3, va0, vb3, vl);
|
||||
#endif
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
||||
|
||||
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
|
||||
vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
|
||||
vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl);
|
||||
vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl);
|
||||
|
||||
vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl);
|
||||
vres3 = VFMACCVV_FLOAT(vres3, va1, vb2, vl);
|
||||
vres2 = VFMACCVV_FLOAT(vres2, va1, vb3, vl);
|
||||
vres3 = VFNMSACVV_FLOAT(vres3, va0, vb3, vl);
|
||||
|
||||
#endif
|
||||
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
||||
|
||||
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
|
||||
vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl);
|
||||
vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl);
|
||||
vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
|
||||
|
||||
vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl);
|
||||
vres3 = VFNMSACVV_FLOAT(vres3, va1, vb2, vl);
|
||||
vres2 = VFMACCVV_FLOAT(vres2, va1, vb3, vl);
|
||||
vres3 = VFMACCVV_FLOAT(vres3, va0, vb3, vl);
|
||||
|
||||
#endif
|
||||
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
|
||||
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
|
||||
vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl);
|
||||
vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl);
|
||||
vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl);
|
||||
|
||||
vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl);
|
||||
vres3 = VFNMSACVV_FLOAT(vres3, va1, vb2, vl);
|
||||
vres2 = VFNMSACVV_FLOAT(vres2, va1, vb3, vl);
|
||||
vres3 = VFNMSACVV_FLOAT(vres3, va0, vb3, vl);
|
||||
|
||||
#endif
|
||||
ptrba += vl * 2;
|
||||
ptrbb += vl * 4;
|
||||
}
|
||||
v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax);
|
||||
v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax);
|
||||
res0 = VFMVFS_FLOAT_M1(v_m1_res0);
|
||||
res1 = VFMVFS_FLOAT_M1(v_m1_res1);
|
||||
C0[0] = res0 * alphar - res1 * alphai;
|
||||
C0[1] = res1 * alphar + res0 * alphai;
|
||||
|
||||
v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres2, v_z0, vlmax);
|
||||
v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres3, v_z0, vlmax);
|
||||
res0 = VFMVFS_FLOAT_M1(v_m1_res0);
|
||||
res1 = VFMVFS_FLOAT_M1(v_m1_res1);
|
||||
C1[0] = res0 * alphar - res1 * alphai;
|
||||
C1[1] = res1 * alphar + res0 * alphai;
|
||||
|
||||
#if ( defined(LEFT) && defined(TRANSA)) || \
|
||||
(!defined(LEFT) && !defined(TRANSA))
|
||||
temp = bk - off;
|
||||
#ifdef LEFT
|
||||
temp -= 1;
|
||||
#else
|
||||
temp -= 2;
|
||||
#endif
|
||||
ptrba += temp*2;
|
||||
ptrbb += temp*2*2;
|
||||
#endif
|
||||
#ifdef LEFT
|
||||
off += 1;
|
||||
#endif
|
||||
C0 = C0+2;
|
||||
C1 = C1+2;
|
||||
}
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
off += 2;
|
||||
#endif
|
||||
k = (bk<<2);
|
||||
bb = bb+k;
|
||||
i = (ldc<<2);
|
||||
C = C+i;
|
||||
}
|
||||
|
||||
if (bn & 1)
|
||||
{
|
||||
C0 = C;
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
off = offset;
|
||||
#endif
|
||||
ptrba = ba;
|
||||
|
||||
for (i = bm/2; i > 0; i--)
|
||||
{
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
ptrbb = bb;
|
||||
#else
|
||||
ptrba += off*2*2;
|
||||
ptrbb = bb+off*2;
|
||||
#endif
|
||||
vres0 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres1 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres2 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres3 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
||||
temp = bk - off;
|
||||
#elif defined(LEFT)
|
||||
temp = off + 2;
|
||||
#else
|
||||
temp = off + 1;
|
||||
#endif
|
||||
|
||||
for (k = temp; k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl);
|
||||
VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl);
|
||||
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
|
||||
vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
|
||||
vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl);
|
||||
vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
|
||||
|
||||
vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl);
|
||||
vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl);
|
||||
vres2 = VFNMSACVV_FLOAT(vres2, va3, vb1, vl);
|
||||
vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl);
|
||||
#endif
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
||||
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
|
||||
vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
|
||||
vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl);
|
||||
vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl);
|
||||
|
||||
vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl);
|
||||
vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl);
|
||||
vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl);
|
||||
vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl);
|
||||
#endif
|
||||
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
||||
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
|
||||
vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl);
|
||||
vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl);
|
||||
vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
|
||||
|
||||
vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl);
|
||||
vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl);
|
||||
vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl);
|
||||
vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl);
|
||||
|
||||
#endif
|
||||
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
|
||||
vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl);
|
||||
vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl);
|
||||
vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl);
|
||||
|
||||
vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl);
|
||||
vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl);
|
||||
vres2 = VFNMSACVV_FLOAT(vres2, va3, vb1, vl);
|
||||
vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl);
|
||||
|
||||
#endif
|
||||
ptrba += vl * 4;
|
||||
ptrbb += vl * 2;
|
||||
}
|
||||
v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax);
|
||||
v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax);
|
||||
res0 = VFMVFS_FLOAT_M1(v_m1_res0);
|
||||
res1 = VFMVFS_FLOAT_M1(v_m1_res1);
|
||||
C0[0] = res0 * alphar - res1 * alphai;
|
||||
C0[1] = res1 * alphar + res0 * alphai;
|
||||
|
||||
v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres2, v_z0, vlmax);
|
||||
v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres3, v_z0, vlmax);
|
||||
res0 = VFMVFS_FLOAT_M1(v_m1_res0);
|
||||
res1 = VFMVFS_FLOAT_M1(v_m1_res1);
|
||||
C0[2] = res0 * alphar - res1 * alphai;
|
||||
C0[3] = res1 * alphar + res0 * alphai;
|
||||
|
||||
#if ( defined(LEFT) && defined(TRANSA)) || \
|
||||
(!defined(LEFT) && !defined(TRANSA))
|
||||
temp = bk-off;
|
||||
#ifdef LEFT
|
||||
temp -= 2;
|
||||
#else
|
||||
temp -= 1;
|
||||
#endif
|
||||
ptrba += temp*2*2;
|
||||
ptrbb += temp*2;
|
||||
#endif
|
||||
#ifdef LEFT
|
||||
off += 2;
|
||||
#endif
|
||||
C0 = C0+4;
|
||||
}
|
||||
|
||||
if (bm & 1)
|
||||
{
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
ptrbb = bb;
|
||||
#else
|
||||
ptrba += off*2;
|
||||
ptrbb = bb + off*2;
|
||||
#endif
|
||||
vres0 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres1 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
||||
temp = bk-off;
|
||||
#elif defined(LEFT)
|
||||
temp = off + 1;
|
||||
#else
|
||||
temp = off + 1;
|
||||
#endif
|
||||
|
||||
for (k = temp; k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
|
||||
VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl);
|
||||
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
|
||||
vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
|
||||
vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl);
|
||||
vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
|
||||
|
||||
#endif
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
||||
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
|
||||
vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
|
||||
vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl);
|
||||
vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl);
|
||||
|
||||
#endif
|
||||
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
||||
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
|
||||
vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl);
|
||||
vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl);
|
||||
vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
|
||||
|
||||
#endif
|
||||
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
|
||||
vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl);
|
||||
vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl);
|
||||
vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl);
|
||||
|
||||
#endif
|
||||
ptrba += vl * 2;
|
||||
ptrbb += vl * 2;
|
||||
|
||||
}
|
||||
|
||||
v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax);
|
||||
v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax);
|
||||
res0 = VFMVFS_FLOAT_M1(v_m1_res0);
|
||||
res1 = VFMVFS_FLOAT_M1(v_m1_res1);
|
||||
|
||||
C0[0] = res0 * alphar - res1 * alphai;
|
||||
C0[1] = res1 * alphar + res0 * alphai;
|
||||
|
||||
#if ( defined(LEFT) && defined(TRANSA)) || \
|
||||
(!defined(LEFT) && !defined(TRANSA))
|
||||
temp = bk - off;
|
||||
#ifdef LEFT
|
||||
temp -= 1;
|
||||
#else
|
||||
temp -= 1;
|
||||
#endif
|
||||
ptrba += temp*2;
|
||||
ptrbb += temp*2;
|
||||
|
||||
#endif
|
||||
#ifdef LEFT
|
||||
off += 1;
|
||||
#endif
|
||||
C0 = C0+2;
|
||||
}
|
||||
k = (bk<<1);
|
||||
bb = bb+k;
|
||||
i = (ldc<<1);
|
||||
C = C+i;
|
||||
}
|
||||
return 0;
|
||||
}
|
44
param.h
44
param.h
|
@@ -3038,6 +3038,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#endif
|
||||
|
||||
#if defined(x280)
|
||||
#define GEMM_DEFAULT_OFFSET_A 0
|
||||
#define GEMM_DEFAULT_OFFSET_B 0
|
||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_M 16 // 4 // 16 // 2
|
||||
#define SGEMM_DEFAULT_UNROLL_N 8 // 4 // 4 // 2
|
||||
|
||||
/* SGEMM_UNROLL_MN is calculated as max(SGEMM_UNROLL_M, SGEMM_UNROLL_N)
|
||||
* Since we don't define SGEMM_UNROLL_M correctly, we have to set this macro manually.
|
||||
* If VLMAX is ever larger than 1024, this should be increased as well. */
|
||||
#define SGEMM_DEFAULT_UNROLL_MN 32
|
||||
|
||||
#define DGEMM_DEFAULT_UNROLL_M 16 //2 // 8
|
||||
#define DGEMM_DEFAULT_UNROLL_N 8 //2 // 4
|
||||
#define DGEMM_DEFAULT_UNROLL_MN 32
|
||||
|
||||
#define CGEMM_DEFAULT_UNROLL_M 2
|
||||
#define CGEMM_DEFAULT_UNROLL_N 2
|
||||
|
||||
#define ZGEMM_DEFAULT_UNROLL_M 2
|
||||
#define ZGEMM_DEFAULT_UNROLL_N 2
|
||||
|
||||
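// P, Q and R are the level-3 cache-blocking sizes (roughly: P x Q is the
// footprint of the packed A block, Q x R that of the packed B block).
// The values below are presumably first-pass tuning choices for the x280.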
#define SGEMM_DEFAULT_P 160
|
||||
#define DGEMM_DEFAULT_P 160
|
||||
#define CGEMM_DEFAULT_P 96
|
||||
#define ZGEMM_DEFAULT_P 64
|
||||
|
||||
#define SGEMM_DEFAULT_Q 240
|
||||
#define DGEMM_DEFAULT_Q 128
|
||||
#define CGEMM_DEFAULT_Q 120
|
||||
#define ZGEMM_DEFAULT_Q 120
|
||||
|
||||
#define SGEMM_DEFAULT_R 12288
|
||||
#define DGEMM_DEFAULT_R 8192
|
||||
#define CGEMM_DEFAULT_R 4096
|
||||
#define ZGEMM_DEFAULT_R 4096
|
||||
|
||||
#define SYMV_P 16
|
||||
|
||||
#define GEMM_DEFAULT_OFFSET_A 0
|
||||
#define GEMM_DEFAULT_OFFSET_B 0
|
||||
|
||||
#endif
|
||||
#ifdef C910V
|
||||
#define GEMM_DEFAULT_OFFSET_A 0
|
||||
#define GEMM_DEFAULT_OFFSET_B 0
|
||||
|
|